From 29eba93bfaf6f69c924a17fe24eccdea281fe4dc Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Tue, 16 Jan 2024 17:26:46 -0800
Subject: [PATCH 01/20] Path to new Questa

---
 setup.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.sh b/setup.sh
index b1ecbd84f..e1d4e6cd3 100755
--- a/setup.sh
+++ b/setup.sh
@@ -16,8 +16,7 @@ echo \$WALLY set to ${WALLY}
 # Must edit these based on your local environment.  Ask your sysadmin.
 export MGLS_LICENSE_FILE=27002@zircon.eng.hmc.edu                   # Change this to your Siemens license server
 export SNPSLMD_LICENSE_FILE=27020@zircon.eng.hmc.edu                # Change this to your Synopsys license server
-export QUESTA_HOME=/cad/mentor/questa_sim-2022.4_2/questasim        # Change this for your path to Questa, excluding bin
-#export QUESTA_HOME=/cad/mentor/questa_sim-2022.4_3/questasim        # Change this for your path to Questa, excluding bin
+export QUESTA_HOME=/cad/mentor/questa_sim-2023.4/questasim        # Change this for your path to Questa, excluding bin
 export SNPS_HOME=/cad/synopsys/SYN                                  # Change this for your path to Design Compiler, excluding bin
 
 # Path to RISC-V Tools

From 0588d611ead1deb5379be81a93b1f7ebb5859dfa Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Tue, 16 Jan 2024 17:27:40 -0800
Subject: [PATCH 02/20] Zfa fli support working for F and D

---
 src/fpu/fctrl.sv       |  7 +++++--
 src/fpu/fpu.sv         | 11 ++++++++++-
 testbench/testbench.sv |  4 +++-
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/fpu/fctrl.sv b/src/fpu/fctrl.sv
index 999837889..d4cc60e87 100755
--- a/src/fpu/fctrl.sv
+++ b/src/fpu/fctrl.sv
@@ -143,14 +143,16 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                                                 ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0; // fmv.x.w/d/h/q  fp to int register
                       7'b11110??: if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
                                                 ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0; // fmv.w/d/h/q.x  int to fp reg
+                                  else if (P.ZFA_SUPPORTED & Funct3D == 3'b000 & Rs2D == 5'b00001) 
+                                                ControlsD = `FCTRLW'b1_0_00_00_111_0_0_0; // fli
                       7'b0100000: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b00)
                                                 ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0; // fcvt.s.(d/q/h)
                       7'b0100001: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b01)
                                                 ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0; // fcvt.d.(s/h/q)
                       // coverage off
-                      // Not covered in testing because rv64gc does not support half or quad precision
                       7'b0100010: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b10)
                                                 ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0; // fcvt.h.(s/d/q)
+                      // Not covered in testing because rv64gc does not support quad precision
                       7'b0100011: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b11)
                                                 ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0; // fcvt.q.(s/h/d)
                       // coverage on
@@ -179,7 +181,6 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                                     5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.d  d->lu
                                   endcase
                       // coverage off
-                      // Not covered in testing because rv64gc does not support half or quad precision
                       7'b1101010: case(Rs2D)
                                     5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.h.w   w->h
                                     5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.h.wu wu->h
@@ -192,6 +193,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                                     5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.h   h->l
                                     5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.h  h->lu
                                   endcase
+                      // Not covered in testing because rv64gc does not support quad precision
                       7'b1101011: case(Rs2D)
                                     5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.q.w   w->q
                                     5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.q.wu wu->q
@@ -274,6 +276,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   //        011 - mv to fp        01
   //        110 - min             10
   //        101 - max             10
+  //        111 - fli             11
 
   //  OpCtrl:
   //    Fma: {not multiply-add?, negate prod?, negate Z?}
diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index 45af38c0c..8be0e4488 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -160,6 +160,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic                        StallUnpackedM;                     // Stall unpacker outputs during multicycle fdivsqrt
   logic [P.FLEN-1:0]           SgnExtXE;                           // Sign-extended X input for move to integer
   logic                        mvsgn;                              // sign bit for extending move
+  logic [P.FLEN-1:0]           FliResE;                            // Floating-point load immediate value
 
   //////////////////////////////////////////////////////////////////////////////////////////
   // Decode Stage: fctrl decoder, read register file
@@ -263,6 +264,14 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
     .ToInt(FWriteIntE), .XZero(XZeroE), .Fmt(FmtE), .Ce(CeE), .ShiftAmt(CvtShiftAmtE), 
     .ResSubnormUf(CvtResSubnormUfE), .Cs(CsE), .IntZero(IntZeroE), .LzcIn(CvtLzcInE));
 
+  // floating-point load immediate: fli
+  if (P.ZFA_SUPPORTED) begin
+    logic [4:0] Rs1E;
+    
+    flopenrc #(5) Rs1EReg(clk, reset, FlushE, ~StallE, InstrD[19:15], Rs1E);
+    fli #(P) fli(.Rs1(Rs1E), .Fmt(FmtE), .Imm(FliResE)); 
+  end else assign FliResE = '0;
+
   // NaN Box SrcA to convert integer to requested FP size for fmv.*.x
   if(P.FPSIZES == 1) assign AlignedSrcAE = {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE};
   else if(P.FPSIZES == 2) 
@@ -276,7 +285,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   end
 
   // select a result that may be written to the FP register
-  mux3  #(P.FLEN) FResMux(SgnResE, AlignedSrcAE, CmpFpResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
+  mux4  #(P.FLEN) FResMux(SgnResE, AlignedSrcAE, CmpFpResE, FliResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
   assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE);
 
   // select the result that may be written to the integer register with fmv.x.*
diff --git a/testbench/testbench.sv b/testbench/testbench.sv
index efd4ea637..87b603288 100644
--- a/testbench/testbench.sv
+++ b/testbench/testbench.sv
@@ -128,7 +128,8 @@ module testbench;
         "arch64zicboz":  if (P.ZICBOZ_SUPPORTED)  tests = arch64zicboz;
         "arch64zcb":     if (P.ZCB_SUPPORTED)     tests = arch64zcb;
         "arch64zfh":     if (P.ZFH_SUPPORTED)     tests = arch64zfh;
-//        "arch64zfa":     if (P.ZFA_SUPPORTED)     tests = arch64zfa;
+        "arch64zfaf":    if (P.ZFA_SUPPORTED)     tests = arch64zfaf;
+        "arch64zfad":    if (P.ZFA_SUPPORTED & P.D_SUPPORTED)  tests = arch64zfad;
       endcase 
     end else begin // RV32
       case (TEST)
@@ -165,6 +166,7 @@ module testbench;
         "arch32zcb":     if (P.ZCB_SUPPORTED)     tests = arch32zcb;
         "arch32zfh":     if (P.ZFH_SUPPORTED)     tests = arch32zfh;
         "arch32zfaf":    if (P.ZFA_SUPPORTED)     tests = arch32zfaf;
+        "arch32zfad":    if (P.ZFA_SUPPORTED & P.D_SUPPORTED)  tests = arch32zfad;
       endcase
     end
     if (tests.size() == 0) begin

From 9d57002c070461ec9518fa811936d4a975846fc6 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Tue, 16 Jan 2024 17:27:59 -0800
Subject: [PATCH 03/20] Zfa fli support working for F and D (add fli.sv module)

---
 src/fpu/fli.sv | 219 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 219 insertions(+)
 create mode 100644 src/fpu/fli.sv

diff --git a/src/fpu/fli.sv b/src/fpu/fli.sv
new file mode 100644
index 000000000..e61415388
--- /dev/null
+++ b/src/fpu/fli.sv
@@ -0,0 +1,219 @@
+///////////////////////////////////////////
+// fli.sv
+//
+// Written: David_Harris@hmc.edu
+// Modified: 1/16/2024
+//
+// Purpose: Floating-point load immediate (Zfa fli)
+// 
+// Documentation: RISC-V System on Chip Design Chapter 16
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module fli import cvw::*;  #(parameter cvw_t P) (
+  input  logic [4:0]        Rs1,           // Index (0-31) of immediate to select
+  input  logic [1:0]        Fmt,           // 00 = single, 01 = double, 10 = half, 11 = quad
+  output logic [P.FLEN-1:0] Imm            // Selected immediate, NaN-boxed to FLEN bits
+);
+  // Zfa fli: Imm is entry number Rs1 of a fixed 32-entry constant table, encoded in the format chosen by Fmt
+  logic [P.FLEN-1:0] HImmBox, SImmBox, DImmBox, QImmBox; // per-format immediates, NaN-boxed to FLEN
+
+  // select constant for each immediate size supported; unsupported sizes are tied to '0
+
+  ////////////////////////////
+  // half
+  ////////////////////////////
+
+  if (P.ZFH_SUPPORTED) begin
+    logic [15:0] HImm;
+    always_comb begin
+      case(Rs1)
+        0:  HImm = 16'hBC00; // -1.0
+        1:  HImm = 16'h0400; // minimum positive normal
+        2:  HImm = 16'h0100; // 2^-16 (subnormal in half)
+        3:  HImm = 16'h0200; // 2^-15 (subnormal in half)
+        4:  HImm = 16'h1C00; // 2^-8
+        5:  HImm = 16'h2000; // 2^-7
+        6:  HImm = 16'h2C00; // 0.0625
+        7:  HImm = 16'h3000; // 0.125
+        8:  HImm = 16'h3400; // 0.25
+        9:  HImm = 16'h3500; // 0.3125
+        10: HImm = 16'h3600; // 0.375
+        11: HImm = 16'h3700; // 0.4375
+        12: HImm = 16'h3800; // 0.5
+        13: HImm = 16'h3900; // 0.625
+        14: HImm = 16'h3A00; // 0.75
+        15: HImm = 16'h3B00; // 0.875
+        16: HImm = 16'h3C00; // 1.0
+        17: HImm = 16'h3D00; // 1.25
+        18: HImm = 16'h3E00; // 1.5
+        19: HImm = 16'h3F00; // 1.75
+        20: HImm = 16'h4000; // 2.0
+        21: HImm = 16'h4100; // 2.5
+        22: HImm = 16'h4200; // 3.0
+        23: HImm = 16'h4400; // 4.0
+        24: HImm = 16'h4800; // 8.0
+        25: HImm = 16'h4C00; // 16.0
+        26: HImm = 16'h5800; // 128.0
+        27: HImm = 16'h5C00; // 256.0
+        28: HImm = 16'h7800; // 2^15
+        29: HImm = 16'h7C00; // 2^16 is not representable in half precision: +infinity
+        30: HImm = 16'h7C00; // +infinity
+        31: HImm = 16'h7E00; // canonical quiet NaN
+      endcase
+    end
+    assign HImmBox = {{(P.FLEN-16){1'b1}}, HImm}; // NaN-box HImm
+  end else assign HImmBox = '0;
+
+  ////////////////////////////
+  // single
+  ////////////////////////////
+
+  logic [31:0] SImm;
+  always_comb begin
+    case(Rs1)
+      0:  SImm = 32'hBF800000; // -1.0
+      1:  SImm = 32'h00800000; // minimum positive normal
+      2:  SImm = 32'h37800000; // 2^-16
+      3:  SImm = 32'h38000000; // 2^-15
+      4:  SImm = 32'h3B800000; // 2^-8
+      5:  SImm = 32'h3C000000; // 2^-7
+      6:  SImm = 32'h3D800000; // 0.0625
+      7:  SImm = 32'h3E000000; // 0.125
+      8:  SImm = 32'h3E800000; // 0.25
+      9:  SImm = 32'h3EA00000; // 0.3125
+      10: SImm = 32'h3EC00000; // 0.375
+      11: SImm = 32'h3EE00000; // 0.4375
+      12: SImm = 32'h3F000000; // 0.5
+      13: SImm = 32'h3F200000; // 0.625
+      14: SImm = 32'h3F400000; // 0.75
+      15: SImm = 32'h3F600000; // 0.875
+      16: SImm = 32'h3F800000; // 1.0
+      17: SImm = 32'h3FA00000; // 1.25
+      18: SImm = 32'h3FC00000; // 1.5
+      19: SImm = 32'h3FE00000; // 1.75
+      20: SImm = 32'h40000000; // 2.0
+      21: SImm = 32'h40200000; // 2.5
+      22: SImm = 32'h40400000; // 3.0
+      23: SImm = 32'h40800000; // 4.0
+      24: SImm = 32'h41000000; // 8.0
+      25: SImm = 32'h41800000; // 16.0
+      26: SImm = 32'h43000000; // 128.0
+      27: SImm = 32'h43800000; // 256.0
+      28: SImm = 32'h47000000; // 2^15
+      29: SImm = 32'h47800000; // 2^16
+      30: SImm = 32'h7F800000; // +infinity
+      31: SImm = 32'h7FC00000; // canonical quiet NaN
+    endcase
+  end
+  assign SImmBox = {{(P.FLEN-32){1'b1}}, SImm}; // NaN-box SImm
+
+  ////////////////////////////
+  // double
+  ////////////////////////////
+
+  if (P.D_SUPPORTED) begin
+    logic [63:0] DImm;
+    always_comb begin
+      case(Rs1)
+        0:  DImm = 64'hBFF0000000000000; // -1.0
+        1:  DImm = 64'h0010000000000000; // minimum positive normal
+        2:  DImm = 64'h3EF0000000000000; // 2^-16
+        3:  DImm = 64'h3F00000000000000; // 2^-15
+        4:  DImm = 64'h3F70000000000000; // 2^-8
+        5:  DImm = 64'h3F80000000000000; // 2^-7
+        6:  DImm = 64'h3FB0000000000000; // 0.0625
+        7:  DImm = 64'h3FC0000000000000; // 0.125
+        8:  DImm = 64'h3FD0000000000000; // 0.25
+        9:  DImm = 64'h3FD4000000000000; // 0.3125
+        10: DImm = 64'h3FD8000000000000; // 0.375
+        11: DImm = 64'h3FDC000000000000; // 0.4375
+        12: DImm = 64'h3FE0000000000000; // 0.5
+        13: DImm = 64'h3FE4000000000000; // 0.625
+        14: DImm = 64'h3FE8000000000000; // 0.75
+        15: DImm = 64'h3FEC000000000000; // 0.875
+        16: DImm = 64'h3FF0000000000000; // 1.0
+        17: DImm = 64'h3FF4000000000000; // 1.25
+        18: DImm = 64'h3FF8000000000000; // 1.5
+        19: DImm = 64'h3FFC000000000000; // 1.75
+        20: DImm = 64'h4000000000000000; // 2.0
+        21: DImm = 64'h4004000000000000; // 2.5
+        22: DImm = 64'h4008000000000000; // 3.0
+        23: DImm = 64'h4010000000000000; // 4.0
+        24: DImm = 64'h4020000000000000; // 8.0
+        25: DImm = 64'h4030000000000000; // 16.0
+        26: DImm = 64'h4060000000000000; // 128.0
+        27: DImm = 64'h4070000000000000; // 256.0
+        28: DImm = 64'h40E0000000000000; // 2^15
+        29: DImm = 64'h40F0000000000000; // 2^16
+        30: DImm = 64'h7FF0000000000000; // +infinity
+        31: DImm = 64'h7FF8000000000000; // canonical quiet NaN
+      endcase
+    end
+    assign DImmBox = {{(P.FLEN-64){1'b1}}, DImm}; // NaN-box DImm
+  end else assign DImmBox = '0;
+
+  ////////////////////////////
+  // quad
+  ////////////////////////////
+
+  if (P.Q_SUPPORTED) begin
+    logic [127:0] QImm; // must be 128 bits wide: a narrower declaration silently truncates the constants below
+    always_comb begin
+      case(Rs1)
+        0:  QImm = 128'hBFFF0000000000000000000000000000; // -1.0
+        1:  QImm = 128'h00010000000000000000000000000000; // minimum positive normal
+        2:  QImm = 128'h3FEF0000000000000000000000000000; // 2^-16
+        3:  QImm = 128'h3FF00000000000000000000000000000; // 2^-15
+        4:  QImm = 128'h3FF70000000000000000000000000000; // 2^-8
+        5:  QImm = 128'h3FF80000000000000000000000000000; // 2^-7
+        6:  QImm = 128'h3FFB0000000000000000000000000000; // 0.0625
+        7:  QImm = 128'h3FFC0000000000000000000000000000; // 0.125
+        8:  QImm = 128'h3FFD0000000000000000000000000000; // 0.25
+        9:  QImm = 128'h3FFD4000000000000000000000000000; // 0.3125
+        10: QImm = 128'h3FFD8000000000000000000000000000; // 0.375
+        11: QImm = 128'h3FFDC000000000000000000000000000; // 0.4375
+        12: QImm = 128'h3FFE0000000000000000000000000000; // 0.5
+        13: QImm = 128'h3FFE4000000000000000000000000000; // 0.625
+        14: QImm = 128'h3FFE8000000000000000000000000000; // 0.75
+        15: QImm = 128'h3FFEC000000000000000000000000000; // 0.875
+        16: QImm = 128'h3FFF0000000000000000000000000000; // 1.0
+        17: QImm = 128'h3FFF4000000000000000000000000000; // 1.25
+        18: QImm = 128'h3FFF8000000000000000000000000000; // 1.5
+        19: QImm = 128'h3FFFC000000000000000000000000000; // 1.75
+        20: QImm = 128'h40000000000000000000000000000000; // 2.0
+        21: QImm = 128'h40004000000000000000000000000000; // 2.5
+        22: QImm = 128'h40008000000000000000000000000000; // 3.0
+        23: QImm = 128'h40010000000000000000000000000000; // 4.0
+        24: QImm = 128'h40020000000000000000000000000000; // 8.0
+        25: QImm = 128'h40030000000000000000000000000000; // 16.0
+        26: QImm = 128'h40060000000000000000000000000000; // 128.0
+        27: QImm = 128'h40070000000000000000000000000000; // 256.0
+        28: QImm = 128'h400E0000000000000000000000000000; // 2^15
+        29: QImm = 128'h400F0000000000000000000000000000; // 2^16
+        30: QImm = 128'h7FFF0000000000000000000000000000; // +infinity
+        31: QImm = 128'h7FFF8000000000000000000000000000; // canonical quiet NaN
+      endcase
+    end
+    assign QImmBox = QImm; // NaN-box QImm trivial because Q is longest format
+  end else assign QImmBox = '0;
+
+  mux4 #(P.FLEN) flimux(SImmBox, DImmBox, HImmBox, QImmBox, Fmt, Imm); // select immediate based on format
+
+endmodule

From 8654375f26504d4efe1b772421f00e87d49a444b Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Tue, 16 Jan 2024 20:03:54 -0800
Subject: [PATCH 04/20] Zfa fminm/fmaxm/fltq/fleq implemented and tested

---
 src/fpu/fcmp.sv    |  49 +++++++-----
 src/fpu/fctrl.sv   | 191 +++++++++++++++++++++++++++------------------
 src/fpu/fpu.sv     |   5 +-
 testbench/tests.vh |  58 ++++++++++++--
 4 files changed, 198 insertions(+), 105 deletions(-)

diff --git a/src/fpu/fcmp.sv b/src/fpu/fcmp.sv
index 9d0d582b5..e330f1fda 100755
--- a/src/fpu/fcmp.sv
+++ b/src/fpu/fcmp.sv
@@ -36,6 +36,7 @@
 module fcmp import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0]   Fmt,           // format of fp number
   input  logic [2:0]             OpCtrl,        // see above table
+  input  logic                   Zfa,           // Zfa variants: fminm, fmaxm, fleq, fltq
   input  logic                   Xs, Ys,        // input signs
   input  logic [P.NE-1:0]        Xe, Ye,        // input exponents
   input  logic [P.NF:0]          Xm, Ym,        // input mantissa
@@ -70,8 +71,8 @@ module fcmp import cvw::*;  #(parameter cvw_t P) (
         3'b110: CmpNV = EitherSNaN; //min 
         3'b101: CmpNV = EitherSNaN; //max
         3'b010: CmpNV = EitherSNaN; //equal
-        3'b001: CmpNV = EitherNaN;  //less than
-        3'b011: CmpNV = EitherNaN;  //less than or equal
+        3'b001: CmpNV = Zfa ? EitherSNaN : EitherNaN;  // fltq / flt perform CompareQuietLess / CompareSignalingLess differing on when to set invalid
+        3'b011: CmpNV = Zfa ? EitherSNaN : EitherNaN;  // fleq / fle differ on when to set invalid
         default: CmpNV = 1'bx;
     endcase
   end 
@@ -128,23 +129,35 @@ module fcmp import cvw::*;  #(parameter cvw_t P) (
   //    - if one is a NaN output the non-NaN
   always_comb
     if(OpCtrl[0]) // MAX
-        if(XNaN)
-          if(YNaN)    CmpFpRes = NaNRes;   // X = NaN Y = NaN
-          else        CmpFpRes = Y;        // X = NaN Y != NaN
-        else
-          if(YNaN)    CmpFpRes = X;        // X != NaN Y = NaN
-          else // X,Y != NaN
-              if(LT)  CmpFpRes = Y;        // X < Y
-              else    CmpFpRes = X;        // X > Y
+        if (Zfa & P.ZFA_SUPPORTED) // fmaxm performs IEEE 754-2019 maximum, which produces NaN if either input is NaN
+          if (XNaN | YNaN) CmpFpRes = NaNRes; // either input is NaN
+          else
+            if (LT) CmpFpRes = Y; // X < Y
+            else    CmpFpRes = X; // X > Y
+        else // fmax performs IEEE 754-2019 maximumNumber, which produces NaN only if both inputs are NaN
+          if(XNaN)
+            if(YNaN)    CmpFpRes = NaNRes;   // X = NaN Y = NaN
+            else        CmpFpRes = Y;        // X = NaN Y != NaN
+          else
+            if(YNaN)    CmpFpRes = X;        // X != NaN Y = NaN
+            else // X,Y != NaN
+                if(LT)  CmpFpRes = Y;        // X < Y
+                else    CmpFpRes = X;        // X > Y
     else  // MIN
-        if(XNaN)
-          if(YNaN)    CmpFpRes = NaNRes;   // X = NaN Y = NaN
-          else        CmpFpRes = Y;        // X = NaN Y != NaN
-        else
-          if(YNaN)    CmpFpRes = X;        // X != NaN Y = NaN
-          else // X,Y != NaN
-              if(LT)  CmpFpRes = X;        // X < Y
-              else    CmpFpRes = Y;        // X > Y
+        if (Zfa & P.ZFA_SUPPORTED) // fminm performs IEEE 754-2019 minimum, which produces NaN if either input is NaN
+          if (XNaN | YNaN) CmpFpRes = NaNRes; // either input is NaN
+          else
+            if (LT) CmpFpRes = X; // X < Y
+            else    CmpFpRes = Y; // X > Y
+        else // fmin performs IEEE 754-2019 minimumNumber, which produces NaN only if both inputs are NaN
+          if(XNaN)
+            if(YNaN)    CmpFpRes = NaNRes;   // X = NaN Y = NaN
+            else        CmpFpRes = Y;        // X = NaN Y != NaN
+          else
+            if(YNaN)    CmpFpRes = X;        // X != NaN Y = NaN
+            else // X,Y != NaN
+                if(LT)  CmpFpRes = X;        // X < Y
+                else    CmpFpRes = Y;        // X > Y
                                   
   // LT/LE/EQ
   //    - -0 = 0
diff --git a/src/fpu/fctrl.sv b/src/fpu/fctrl.sv
index d4cc60e87..9f60a692f 100755
--- a/src/fpu/fctrl.sv
+++ b/src/fpu/fctrl.sv
@@ -54,6 +54,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   output logic [1:0]           PostProcSelE, PostProcSelM,         // select result in the post processing unit
   output logic [1:0]           FResSelE, FResSelM, FResSelW,       // Select one of the results that finish in the memory stage
   output logic                 FPUActiveE,                         // FP instruction being executed
+  output logic                 ZfaE,                               // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod)
   // register control signals
   output logic                 FRegWriteE, FRegWriteM, FRegWriteW, // FP register write enable
   output logic                 FWriteIntE, FWriteIntM,             // Write to integer register
@@ -64,7 +65,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   output logic                 FDivStartE, IDivStartE              // Start division or squareroot
   );
 
-  `define FCTRLW 12
+  `define FCTRLW 13
 
   logic [`FCTRLW-1:0]          ControlsD;                          // control signals
   logic                        FRegWriteD;                         // FP register write enable
@@ -79,6 +80,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   logic                        SupportedFmt;                       // is the format supported
   logic                        SupportedFmt2;                      // is the source format supported for fp -> fp
   logic                        FCvtIntD, FCvtIntM;                 // convert to integer opperation
+  logic                        ZfaD;                               // Zfa variants of instructions
 
   // FPU Instruction Decoder
   assign Fmt = Funct7D[1:0];
@@ -91,129 +93,164 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                          (Fmt2 == 2'b10 & P.ZFH_SUPPORTED) | (Fmt2 == 2'b11 & P.Q_SUPPORTED));
 
   // decode the instruction                       
-  // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt
+  // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt_Zfa
   always_comb
     if (STATUS_FS == 2'b00) // FPU instructions are illegal when FPU is disabled
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0;
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0;
     else if (OpD != 7'b0000111 & OpD != 7'b0100111 & ~SupportedFmt) 
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0; // for anything other than loads and stores, check for supported format
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0; // for anything other than loads and stores, check for supported format
     else begin 
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0; // default: non-implemented instruction
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0; // default: non-implemented instruction
       /* verilator lint_off CASEINCOMPLETE */   // default value above has priority so no other default needed
       case(OpD)
         7'b0000111: case(Funct3D)
-                      3'b010:                       ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0; // flw
-                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0; // fld
-                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0; // flq
-                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0; // flh
+                      3'b010:                       ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flw
+                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // fld
+                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flq
+                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flh
                     endcase
         7'b0100111: case(Funct3D)
-                      3'b010:                       ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0; // fsw
-                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0; // fsd
-                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0; // fsq
-                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0; // fsh
+                      3'b010:                       ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsw
+                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsd
+                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsq
+                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsh
                     endcase
-        7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0; // fmadd
-        7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0; // fmsub
-        7'b1001011:   ControlsD = `FCTRLW'b1_0_01_10_010_0_0_0; // fnmsub
-        7'b1001111:   ControlsD = `FCTRLW'b1_0_01_10_011_0_0_0; // fnmadd
+        7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0_0; // fmadd
+        7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0_0; // fmsub
+        7'b1001011:   ControlsD = `FCTRLW'b1_0_01_10_010_0_0_0_0; // fnmsub
+        7'b1001111:   ControlsD = `FCTRLW'b1_0_01_10_011_0_0_0_0; // fnmadd
         7'b1010011: casez(Funct7D)
-                      7'b00000??: ControlsD = `FCTRLW'b1_0_01_10_110_0_0_0; // fadd
-                      7'b00001??: ControlsD = `FCTRLW'b1_0_01_10_111_0_0_0; // fsub
-                      7'b00010??: ControlsD = `FCTRLW'b1_0_01_10_100_0_0_0; // fmul
-                      7'b00011??: ControlsD = `FCTRLW'b1_0_01_01_xx0_1_0_0; // fdiv
-                      7'b01011??: if (Rs2D == 5'b0000) ControlsD = `FCTRLW'b1_0_01_01_xx1_1_0_0; // fsqrt
+                      7'b00000??: ControlsD = `FCTRLW'b1_0_01_10_110_0_0_0_0; // fadd
+                      7'b00001??: ControlsD = `FCTRLW'b1_0_01_10_111_0_0_0_0; // fsub
+                      7'b00010??: ControlsD = `FCTRLW'b1_0_01_10_100_0_0_0_0; // fmul
+                      7'b00011??: ControlsD = `FCTRLW'b1_0_01_01_xx0_1_0_0_0; // fdiv
+                      7'b01011??: if (Rs2D == 5'b0000) ControlsD = `FCTRLW'b1_0_01_01_xx1_1_0_0_0; // fsqrt
                       7'b00100??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_000_0_0_0; // fsgnj
-                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_001_0_0_0; // fsgnjn
-                                    3'b010:  ControlsD = `FCTRLW'b1_0_00_00_010_0_0_0; // fsgnjx
+                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_000_0_0_0_0; // fsgnj
+                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_001_0_0_0_0; // fsgnjn
+                                    3'b010:  ControlsD = `FCTRLW'b1_0_00_00_010_0_0_0_0; // fsgnjx
                                   endcase
                       7'b00101??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0; // fmin
-                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0; // fmax
+                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_0; // fmin
+                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_0; // fmax
+                                    3'b010:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_1; // fminm  (Zfa)
+                                    3'b011:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_1; // fmaxm  (Zfa)
                                   endcase
                       7'b10100??: case(Funct3D)
-                                    3'b010:  ControlsD = `FCTRLW'b0_1_00_00_010_0_0_0; // feq
-                                    3'b001:  ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0; // flt
-                                    3'b000:  ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0; // fle
+                                    3'b000:  ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_0; // fle
+                                    3'b001:  ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_0; // flt
+                                    3'b010:  ControlsD = `FCTRLW'b0_1_00_00_010_0_0_0_0; // feq
+                                    3'b100:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_1; // fleq  (Zfa)
+                                    3'b101:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_1; // fltq  (Zfa)
                                   endcase
                       7'b11100??: if (Funct3D == 3'b001 & Rs2D == 5'b00000)          
-                                                ControlsD = `FCTRLW'b0_1_10_00_000_0_0_0; // fclass
+                                                ControlsD = `FCTRLW'b0_1_10_00_000_0_0_0_0; // fclass
                                   else if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
-                                                ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0; // fmv.x.w/d/h/q  fp to int register
+                                                ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_0; // fmv.x.w/d/h/q  fp to int register
+                                  else if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct7D[1:0] == 2'b01 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
+                                                  ControlsD = '0; // fmvh.x.d  (Zfa) *** needs values for all moves
+                                  // coverage off    Q not supported in RV64GC
+                                  else if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct7D[1:0] == 2'b11 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
+                                                  ControlsD = '0; // fmvh.x.q  (Zfa)
+                                  // coverage on
                       7'b11110??: if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
-                                                ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0; // fmv.w/d/h/q.x  int to fp reg
+                                                ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0_0; // fmv.w/d/h/q.x  int to fp reg
                                   else if (P.ZFA_SUPPORTED & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                ControlsD = `FCTRLW'b1_0_00_00_111_0_0_0; // fli
+                                                ControlsD = `FCTRLW'b1_0_00_00_111_0_0_0_1; // fli  (Zfa)
                       7'b0100000: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b00)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0; // fcvt.s.(d/q/h)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_0; // fcvt.s.(d/q/h)
+                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.s  (Zfa) *** needs ctrl for all rounds
+                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.s  (Zfa) *** needs ctrl for all rounds
                       7'b0100001: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b01)
-                                                ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0; // fcvt.d.(s/h/q)
-                      // coverage off
+                                                ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0_0; // fcvt.d.(s/h/q)
+                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.d  (Zfa)
+                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.d  (Zfa)
                       7'b0100010: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b10)
-                                                ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0; // fcvt.h.(s/d/q)
+                                                ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0_0; // fcvt.h.(s/d/q)
+                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.h  (Zfa)
+                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.h  (Zfa)
+                      // coverage off
                       // Not covered in testing because rv64gc does not support quad precision
                       7'b0100011: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b11)
-                                                ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0; // fcvt.q.(s/h/d)
+                                                ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0_0; // fcvt.q.(s/h/d)
+                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.q  (Zfa)
+                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.q  (Zfa)
                       // coverage on
                       7'b1101000: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.s.w   w->s
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.s.wu wu->s
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0; // fcvt.s.l   l->s
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0; // fcvt.s.lu lu->s
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.s.w   w->s
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.s.wu wu->s
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.s.l   l->s
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.s.lu lu->s
                                   endcase
                       7'b1100000: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1; // fcvt.w.s   s->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1; // fcvt.wu.s  s->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.s   s->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.s  s->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.s   s->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.s  s->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.s   s->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.s  s->lu
                                   endcase
                       7'b1101001: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.d.w   w->d
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.d.wu wu->d
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0; // fcvt.d.l   l->d
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0; // fcvt.d.lu lu->d
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.d.w   w->d
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.d.wu wu->d
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.d.l   l->d
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.d.lu lu->d
                                   endcase
                       7'b1100001: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1; // fcvt.w.d   d->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1; // fcvt.wu.d  d->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.d   d->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.d  d->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.d   d->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.d  d->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.d   d->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.d  d->lu
+                                    5'b01000: if (P.ZFA_SUPPORTED & P.D_SUPPORTED & Funct3D == 3'b001) 
+                                                 ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_1; // fcvtmod.w.d (Zfa)
                                   endcase
-                      // coverage off
                       7'b1101010: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.h.w   w->h
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.h.wu wu->h
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0; // fcvt.h.l   l->h
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0; // fcvt.h.lu lu->h
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.h.w   w->h
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.h.wu wu->h
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.h.l   l->h
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.h.lu lu->h
                                   endcase
                       7'b1100010: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1; // fcvt.w.h   h->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1; // fcvt.wu.h  h->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.h   h->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.h  h->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.h   h->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.h  h->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.h   h->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.h  h->lu
                                   endcase
                       // Not covered in testing because rv64gc does not support quad precision
+                      // coverage off
                       7'b1101011: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.q.w   w->q
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.q.wu wu->q
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0; // fcvt.q.l   l->q
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0; // fcvt.q.lu lu->q
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.q.w   w->q
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.q.wu wu->q
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.q.l   l->q
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.q.lu lu->q
                                   endcase
                       7'b1100011: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1; // fcvt.w.q   q->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1; // fcvt.wu.q  q->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.q   q->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.q  q->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.q   q->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.q  q->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.q   q->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.q  q->lu
                                   endcase
                       // coverage on
-                    endcase
+                      7'b1011001: if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct3D == 3'b000) 
+                                                  ControlsD = '0; // fmvp.d.x  (Zfa)
+                      // Not covered in testing because rv64gc does not support quad precision
+                      // coverage off
+                      7'b1011011: if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct3D == 3'b000) 
+                                                  ControlsD = '0; // fmvp.q.x  (Zfa)
+                      // coverage on
+                   endcase
       endcase
     end
     /* verilator lint_on CASEINCOMPLETE */
 
   // unswizzle control bits
-  assign #1 {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD, FCvtIntD} = ControlsD;
+  assign #1 {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD, FCvtIntD, ZfaD} = ControlsD;
   
   // rounding modes:
   //    000 - round to nearest, ties to even
@@ -313,9 +350,9 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   assign Adr3D = InstrD[31:27];
  
   // D/E pipleine register
-  flopenrc #(14+P.FMTBITS) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-              {FRegWriteD, PostProcSelD, FResSelD, FrmD, FmtD, OpCtrlD, FWriteIntD, FCvtIntD, ~IllegalFPUInstrD},
-              {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, FPUActiveE});
+  flopenrc #(15+P.FMTBITS) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+              {FRegWriteD, PostProcSelD, FResSelD, FrmD, FmtD, OpCtrlD, FWriteIntD, FCvtIntD, ZfaD, ~IllegalFPUInstrD},
+              {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE, FPUActiveE});
   flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {Adr1D, Adr2D, Adr3D}, {Adr1E, Adr2E, Adr3E});
   flopenrc #(1) DEFDivStartReg(clk, reset, FlushE, ~StallE|FDivBusyE, FDivStartD, FDivStartE);
   flopenrc #(3) DEEnReg(clk, reset, FlushE, ~StallE, {XEnD, YEnD, ZEnD}, {XEnE, YEnE, ZEnE});
diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index 8be0e4488..bd387f5d7 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -83,6 +83,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic                        XEnE, YEnE, ZEnE;                   // X, Y, Z inputs used for current operation
   logic                        FRegWriteE;                         // Write floating-point register
   logic                        FPUActiveE;                         // FP instruction being executed
+  logic                        ZfaE;                               // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod.w.d)
 
   // regfile signals
   logic [P.FLEN-1:0]           FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
@@ -170,7 +171,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   fctrl #(P) fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
               .IntDivE, .InstrD,
               .StallE, .StallM, .StallW, .FlushE, .FlushM, .FlushW, .FRM_REGW, .STATUS_FS, .FDivBusyE,
-              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .FrmM, .FmtE, .FmtM,
+              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .ZfaE, .FrmM, .FmtE, .FmtM,
               .FDivStartE, .IDivStartE, .FWriteIntE, .FCvtIntE, .FWriteIntM, .OpCtrlE, .OpCtrlM, .FpLoadStoreM,
               .IllegalFPUInstrD, .XEnD, .YEnD, .ZEnD, .XEnE, .YEnE, .ZEnE,
               .FResSelE, .FResSelM, .FResSelW, .FPUActiveE, .PostProcSelE, .PostProcSelM, .FCvtIntW, 
@@ -247,7 +248,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
     .UmM, .FIntDivResultM);
 
   // compare: fmin/fmax, flt/fle/feq
-  fcmp #(P) fcmp (.Fmt(FmtE), .OpCtrl(OpCtrlE), .Xs(XsE), .Ys(YsE), .Xe(XeE), .Ye(YeE), 
+  fcmp #(P) fcmp (.Fmt(FmtE), .OpCtrl(OpCtrlE), .Zfa(ZfaE), .Xs(XsE), .Ys(YsE), .Xe(XeE), .Ye(YeE), 
     .Xm(XmE), .Ym(YmE), .XZero(XZeroE), .YZero(YZeroE), .XNaN(XNaNE), .YNaN(YNaNE), 
     .XSNaN(XSNaNE), .YSNaN(YSNaNE), .X(XE), .Y(YE), .CmpNV(CmpNVE), 
     .CmpFpRes(CmpFpResE), .CmpIntRes(CmpIntResE));
diff --git a/testbench/tests.vh b/testbench/tests.vh
index 86f65eb14..43cbc91a0 100644
--- a/testbench/tests.vh
+++ b/testbench/tests.vh
@@ -1999,16 +1999,58 @@ string arch64zbs[] = '{
 
   string arch32zfaf[] = '{
     `RISCVARCHTEST,
-    "rv32i_m/F_Zfa/src/fle_b1-01.S",
-    "rv32i_m/F_Zfa/src/fle_b19-01.S",
-    "rv32i_m/F_Zfa/src/fli_b1-01.S",
+    "rv32i_m/F_Zfa/src/fleq_b1-01.S",
+    "rv32i_m/F_Zfa/src/fleq_b19-01.S", 
+    "rv32i_m/F_Zfa/src/fli.s-01.S",
     "rv32i_m/F_Zfa/src/fltq_b1-01.S",
     "rv32i_m/F_Zfa/src/fltq_b19-01.S",
-    "rv32i_m/F_Zfa/src/fmin_b1-01.S",
-    "rv32i_m/F_Zfa/src/fmin_b19-01.S",
-    "rv32i_m/F_Zfa/src/fmax_b1-01.S",
-    "rv32i_m/F_Zfa/src/fmax_b19-01.S",
-    "rv32i_m/F_Zfa/src/fround_b1-01.S"
+    "rv32i_m/F_Zfa/src/fminm_b1-01.S",
+    "rv32i_m/F_Zfa/src/fminm_b19-01.S",
+    "rv32i_m/F_Zfa/src/fmaxm_b1-01.S",
+    "rv32i_m/F_Zfa/src/fmaxm_b19-01.S"
+/*    "rv32i_m/F_Zfa/src/fround_b1-01.S" */
+  };
+
+  string arch32zfad[] = '{
+    `RISCVARCHTEST,
+    "rv32i_m/D_Zfa/src/fleq_b1-01.S",
+    "rv32i_m/D_Zfa/src/fleq_b19-01.S", 
+    "rv32i_m/D_Zfa/src/fli.d-01.S",
+    "rv32i_m/D_Zfa/src/fltq_b1-01.S",
+    "rv32i_m/D_Zfa/src/fltq_b19-01.S",
+    "rv32i_m/D_Zfa/src/fminm_b1-01.S",
+    "rv32i_m/D_Zfa/src/fminm_b19-01.S",
+    "rv32i_m/D_Zfa/src/fmaxm_b1-01.S",
+    "rv32i_m/D_Zfa/src/fmaxm_b19-01.S"
+/*    "rv32i_m/D_Zfa/src/fround_b1-01.S" */
+  };
+
+  string arch64zfaf[] = '{
+    `RISCVARCHTEST,
+    "rv64i_m/F_Zfa/src/fleq_b1-01.S",
+    "rv64i_m/F_Zfa/src/fleq_b19-01.S", 
+    "rv64i_m/F_Zfa/src/fli.s-01.S",
+    "rv64i_m/F_Zfa/src/fltq_b1-01.S",
+    "rv64i_m/F_Zfa/src/fltq_b19-01.S",
+    "rv64i_m/F_Zfa/src/fminm_b1-01.S",
+    "rv64i_m/F_Zfa/src/fminm_b19-01.S",
+    "rv64i_m/F_Zfa/src/fmaxm_b1-01.S",
+    "rv64i_m/F_Zfa/src/fmaxm_b19-01.S"
+/*    "rv64i_m/F_Zfa/src/fround_b1-01.S" */
+  };
+
+  string arch64zfad[] = '{
+    `RISCVARCHTEST,
+    "rv64i_m/D_Zfa/src/fleq_b1-01.S",
+    "rv64i_m/D_Zfa/src/fleq_b19-01.S", 
+    "rv64i_m/D_Zfa/src/fli.d-01.S",
+    "rv64i_m/D_Zfa/src/fltq_b1-01.S",
+    "rv64i_m/D_Zfa/src/fltq_b19-01.S",
+    "rv64i_m/D_Zfa/src/fminm_b1-01.S",
+    "rv64i_m/D_Zfa/src/fminm_b19-01.S",
+    "rv64i_m/D_Zfa/src/fmaxm_b1-01.S",
+    "rv64i_m/D_Zfa/src/fmaxm_b19-01.S"
+/*     "rv64i_m/D_Zfa/src/fround_b1-01.S" */
   };
 
   string arch32d_fma[] = '{

From 07e7e022415a66a9cc89c5c53b4f5fe72f71558d Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Tue, 16 Jan 2024 21:26:42 -0800
Subject: [PATCH 05/20] Coded Zfa fmvp but no tests exist

---
 src/fpu/fpu.sv     | 18 +++++++++++-------
 testbench/tests.vh | 12 +++++++++++-
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index bd387f5d7..430750c71 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -155,7 +155,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic [P.FLEN-1:0]           FResultW;                           // final FP result being written to the FP register   
 
   // other signals
-  logic [P.FLEN-1:0]           AlignedSrcAE;                       // align SrcA from IEU to the floating point format for fmv
+  logic [P.FLEN-1:0]           PreIntSrcE, IntSrcE;                // align SrcA from IEU to the floating point format for fmv / fmvp
   logic [P.FLEN-1:0]           BoxedZeroE;                         // Zero value for Z for multiplication, with NaN boxing if needed
   logic [P.FLEN-1:0]           BoxedOneE;                          // One value for Z for multiplication, with NaN boxing if needed
   logic                        StallUnpackedM;                     // Stall unpacker outputs during multicycle fdivsqrt
@@ -273,23 +273,27 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
     fli #(P) fli(.Rs1(Rs1E), .Fmt(FmtE), .Imm(FliResE)); 
   end else assign FliResE = '0;
 
-  // NaN Box SrcA to convert integer to requested FP size for fmv.*.x
-  if(P.FPSIZES == 1) assign AlignedSrcAE = {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE};
+  // fmv.*.x: NaN Box SrcA to extend integer to requested FP size 
+  if(P.FPSIZES == 1) assign PreIntSrcE = {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE};
   else if(P.FPSIZES == 2) 
-    mux2 #(P.FLEN) SrcAMux ({{P.FLEN-P.LEN1{1'b1}}, ForwardedSrcAE[P.LEN1-1:0]}, {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE}, FmtE, AlignedSrcAE);
+    mux2 #(P.FLEN) SrcAMux ({{P.FLEN-P.LEN1{1'b1}}, ForwardedSrcAE[P.LEN1-1:0]}, {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE}, FmtE, PreIntSrcE);
   else if(P.FPSIZES == 3 | P.FPSIZES == 4) begin
     localparam XD_LEN = P.D_LEN < P.XLEN ? P.D_LEN : P.XLEN; // shorter of D_LEN and XLEN
     mux3 #(P.FLEN) SrcAMux ({{P.FLEN-P.S_LEN{1'b1}}, ForwardedSrcAE[P.S_LEN-1:0]}, 
                             {{P.FLEN-XD_LEN{1'b1}}, ForwardedSrcAE[XD_LEN-1:0]}, 
                             {{P.FLEN-P.H_LEN{1'b1}}, ForwardedSrcAE[P.H_LEN-1:0]}, 
-                            FmtE, AlignedSrcAE); // NaN boxing zeroes
+                            FmtE, PreIntSrcE); // NaN boxing zeroes
   end
+  // fmvp.*.x: Select pair of registers.  Requires Zfa AND (RV32 with D, or RV64 with Q); inner OR is parenthesized because & binds tighter than |
+  if (P.ZFA_SUPPORTED & ((P.XLEN==32 & P.D_SUPPORTED) | (P.XLEN==64 & P.Q_SUPPORTED)))
+       assign IntSrcE = ZfaE ? {ForwardedSrcBE, ForwardedSrcAE} : PreIntSrcE; // ZfaE set: pair of integer registers {SrcB, SrcA} for fmvp.d.x / fmvp.q.x; otherwise NaN-boxed SrcA for fmv.*.x
+  else assign IntSrcE = PreIntSrcE; // no fmvp in this configuration: pass the fmv.*.x source straight through
 
   // select a result that may be written to the FP register
-  mux4  #(P.FLEN) FResMux(SgnResE, AlignedSrcAE, CmpFpResE, FliResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
+  mux4  #(P.FLEN) FResMux(SgnResE, IntSrcE, CmpFpResE, FliResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
   assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE);
 
-  // select the result that may be written to the integer register with fmv.x.*
+  // fmv.x.*: select the result that may be written to the integer register
   if(P.FPSIZES == 1) begin
     assign mvsgn = XE[P.FLEN-1];
     assign SgnExtXE = XE;
diff --git a/testbench/tests.vh b/testbench/tests.vh
index 43cbc91a0..0c60228dd 100644
--- a/testbench/tests.vh
+++ b/testbench/tests.vh
@@ -2004,6 +2004,8 @@ string arch64zbs[] = '{
     "rv32i_m/F_Zfa/src/fli.s-01.S",
     "rv32i_m/F_Zfa/src/fltq_b1-01.S",
     "rv32i_m/F_Zfa/src/fltq_b19-01.S",
+    "rv32i_m/D_Zfa/src/fltq_b1-01.S", // these D tests are more comprehensive and seem they should replace the F tests.  Applies to all F tests duplicated in D
+    "rv32i_m/D_Zfa/src/fltq_b19-01.S",
     "rv32i_m/F_Zfa/src/fminm_b1-01.S",
     "rv32i_m/F_Zfa/src/fminm_b19-01.S",
     "rv32i_m/F_Zfa/src/fmaxm_b1-01.S",
@@ -2015,13 +2017,21 @@ string arch64zbs[] = '{
     `RISCVARCHTEST,
     "rv32i_m/D_Zfa/src/fleq_b1-01.S",
     "rv32i_m/D_Zfa/src/fleq_b19-01.S", 
+    "rv32i_m/D_Zfa/src/fleq.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fleq.d_b19-01.S", 
     "rv32i_m/D_Zfa/src/fli.d-01.S",
     "rv32i_m/D_Zfa/src/fltq_b1-01.S",
     "rv32i_m/D_Zfa/src/fltq_b19-01.S",
+    "rv32i_m/D_Zfa/src/fltq.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fltq.d_b19-01.S",
     "rv32i_m/D_Zfa/src/fminm_b1-01.S",
     "rv32i_m/D_Zfa/src/fminm_b19-01.S",
+    "rv32i_m/D_Zfa/src/fminm.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fminm.d_b19-01.S",
     "rv32i_m/D_Zfa/src/fmaxm_b1-01.S",
-    "rv32i_m/D_Zfa/src/fmaxm_b19-01.S"
+    "rv32i_m/D_Zfa/src/fmaxm_b19-01.S",
+    "rv32i_m/D_Zfa/src/fmaxm.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fmaxm.d_b19-01.S"
 /*    "rv32i_m/D_Zfa/src/fround_b1-01.S" */
   };
 

From 4cfc86140c5c31740d5fd2a1272c59ebe5d36a53 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 17 Jan 2024 06:18:00 -0800
Subject: [PATCH 06/20] Zfa fmvh complete and passing tests:

---
 src/fpu/fctrl.sv   | 4 ++--
 src/fpu/fpu.sv     | 3 ++-
 testbench/tests.vh | 9 ++++++++-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/fpu/fctrl.sv b/src/fpu/fctrl.sv
index 9f60a692f..8dae34f89 100755
--- a/src/fpu/fctrl.sv
+++ b/src/fpu/fctrl.sv
@@ -148,10 +148,10 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                                   else if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
                                                 ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_0; // fmv.x.w/d/h/q  fp to int register
                                   else if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct7D[1:0] == 2'b01 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                  ControlsD = '0; // fmvh.x.d  (Zfa) *** needs values for all moves
+                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.d  (Zfa) 
                                   // coverage off    Q not supported in RV64GC
                                   else if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct7D[1:0] == 2'b11 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                  ControlsD = '0; // fmvh.x.q  (Zfa)
+                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.q  (Zfa)
                                   // coverage on
                       7'b11110??: if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
                                                 ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0_0; // fmv.w/d/h/q.x  int to fp reg
diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index 430750c71..85ea9dba6 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -310,7 +310,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
 
   // sign extend to XLEN if necessary
   if (P.FLEN>P.XLEN)
-    assign IntSrcXE = SgnExtXE[P.XLEN-1:0];
+    if (P.ZFA_SUPPORTED) assign IntSrcXE = ZfaE ? XE[P.FLEN-1:P.FLEN/2] : SgnExtXE[P.XLEN-1:0]; // either fmvh.x.* or fmv.x.*
+    else                 assign IntSrcXE = SgnExtXE[P.XLEN-1:0];
   else 
     assign IntSrcXE = {{P.XLEN-P.FLEN{mvsgn}}, SgnExtXE};
   mux3 #(P.XLEN) IntResMux (ClassResE, IntSrcXE, CmpIntResE, {~FResSelE[1], FResSelE[0]}, FIntResE);
diff --git a/testbench/tests.vh b/testbench/tests.vh
index 0c60228dd..7d2d320a8 100644
--- a/testbench/tests.vh
+++ b/testbench/tests.vh
@@ -2031,7 +2031,14 @@ string arch64zbs[] = '{
     "rv32i_m/D_Zfa/src/fmaxm_b1-01.S",
     "rv32i_m/D_Zfa/src/fmaxm_b19-01.S",
     "rv32i_m/D_Zfa/src/fmaxm.d_b1-01.S",
-    "rv32i_m/D_Zfa/src/fmaxm.d_b19-01.S"
+    "rv32i_m/D_Zfa/src/fmaxm.d_b19-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b22-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b23-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b24-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b27-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b28-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b29-01.S"
 /*    "rv32i_m/D_Zfa/src/fround_b1-01.S" */
   };
 

From 74b242ce5ca85b4f19587094ad4739c85372ed0a Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 17 Jan 2024 12:25:06 -0800
Subject: [PATCH 07/20] Partial implementation of fcvtmod.w.d; flags disagree
 in one case where Sail might be wrong, and result 134 is wrong because of
 overflow

---
 src/fpu/fctrl.sv                   | 15 ++++++++-------
 src/fpu/fpu.sv                     |  6 +++---
 src/fpu/postproc/postprocess.sv    |  5 +++--
 src/fpu/postproc/specialcase.sv    | 30 ++++++++++++++++++++++++++----
 testbench/common/instrNameDecTB.sv | 12 ++++++++++++
 testbench/tests.vh                 |  7 +++++++
 6 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/src/fpu/fctrl.sv b/src/fpu/fctrl.sv
index 8dae34f89..6d5a91aa6 100755
--- a/src/fpu/fctrl.sv
+++ b/src/fpu/fctrl.sv
@@ -54,7 +54,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   output logic [1:0]           PostProcSelE, PostProcSelM,         // select result in the post processing unit
   output logic [1:0]           FResSelE, FResSelM, FResSelW,       // Select one of the results that finish in the memory stage
   output logic                 FPUActiveE,                         // FP instruction being executed
-  output logic                 ZfaE,                               // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod)
+  output logic                 ZfaE, ZfaM,                         // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod)
   // register control signals
   output logic                 FRegWriteE, FRegWriteM, FRegWriteW, // FP register write enable
   output logic                 FWriteIntE, FWriteIntM,             // Write to integer register
@@ -149,7 +149,8 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                                                 ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_0; // fmv.x.w/d/h/q  fp to int register
                                   else if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct7D[1:0] == 2'b01 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
                                                   ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.d  (Zfa) 
-                                  // coverage off    Q not supported in RV64GC
+                                  //  Q not supported in RV64GC
+                                  // coverage off   
                                   else if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct7D[1:0] == 2'b11 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
                                                   ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.q  (Zfa)
                                   // coverage on
@@ -238,11 +239,11 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                                   endcase
                       // coverage on
                       7'b1011001: if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct3D == 3'b000) 
-                                                  ControlsD = '0; // fmvp.d.x  (Zfa)
+                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fmvp.d.x  (Zfa) *** untested, controls could be wrong
                       // Not covered in testing because rv64gc does not support quad precision
                       // coverage off
                       7'b1011011: if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct3D == 3'b000) 
-                                                  ControlsD = '0; // fmvp.q.x  (Zfa)
+                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fmvp.q.x  (Zfa)
                       // coverage on
                    endcase
       endcase
@@ -362,9 +363,9 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   else                               assign IDivStartE = 0; 
 
   // E/M pipleine register
-  flopenrc #(13+int'(P.FMTBITS)) EMCtrlReg (clk, reset, FlushM, ~StallM,
-              {FRegWriteE, FResSelE, PostProcSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE},
-              {FRegWriteM, FResSelM, PostProcSelM, FrmM, FmtM, OpCtrlM, FWriteIntM, FCvtIntM});
+  flopenrc #(14+int'(P.FMTBITS)) EMCtrlReg (clk, reset, FlushM, ~StallM,
+              {FRegWriteE, FResSelE, PostProcSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE},
+              {FRegWriteM, FResSelM, PostProcSelM, FrmM, FmtM, OpCtrlM, FWriteIntM, FCvtIntM, ZfaM});
   
   // renameing for readability
   assign FpLoadStoreM = FResSelM[1];
diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index 85ea9dba6..c304219aa 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -83,7 +83,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic                        XEnE, YEnE, ZEnE;                   // X, Y, Z inputs used for current operation
   logic                        FRegWriteE;                         // Write floating-point register
   logic                        FPUActiveE;                         // FP instruction being executed
-  logic                        ZfaE;                               // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod.w.d)
+  logic                        ZfaE, ZfaM;                         // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod.w.d)
 
   // regfile signals
   logic [P.FLEN-1:0]           FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
@@ -171,7 +171,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   fctrl #(P) fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
               .IntDivE, .InstrD,
               .StallE, .StallM, .StallW, .FlushE, .FlushM, .FlushW, .FRM_REGW, .STATUS_FS, .FDivBusyE,
-              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .ZfaE, .FrmM, .FmtE, .FmtM,
+              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .ZfaE, .ZfaM, .FrmM, .FmtE, .FmtM,
               .FDivStartE, .IDivStartE, .FWriteIntE, .FCvtIntE, .FWriteIntM, .OpCtrlE, .OpCtrlM, .FpLoadStoreM,
               .IllegalFPUInstrD, .XEnD, .YEnD, .ZEnD, .XEnE, .YEnE, .ZEnE,
               .FResSelE, .FResSelM, .FResSelW, .FPUActiveE, .PostProcSelE, .PostProcSelM, .FCvtIntW, 
@@ -348,7 +348,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
     .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), 
     .FmaSm(SmM), .DivUe(UeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
     .CvtCe(CeM), .CvtResSubnormUf(CvtResSubnormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), 
-    .ToInt(FWriteIntM), .DivSticky(DivStickyM), .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), 
+    .ToInt(FWriteIntM), .Zfa(ZfaM), .DivSticky(DivStickyM), .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), 
     .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));
 
   // FPU flag selection - to privileged
diff --git a/src/fpu/postproc/postprocess.sv b/src/fpu/postproc/postprocess.sv
index c2de8644e..516752a78 100644
--- a/src/fpu/postproc/postprocess.sv
+++ b/src/fpu/postproc/postprocess.sv
@@ -56,6 +56,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   input logic                              CvtResSubnormUf,     // the convert result is subnormal or underflows
   input logic  [P.LOGCVTLEN-1:0]           CvtShiftAmt,         // how much to shift by
   input logic                              ToInt,               // is fp->int (since it's writting to the integer register)
+  input logic                              Zfa,                 // Zfa operation (fcvtmod.w.d)
   input logic  [P.CVTLEN-1:0]              CvtLzcIn,            // input to the Leading Zero Counter (without msb)
   input logic                              IntZero,             // is the integer input zero
   // final results
@@ -216,9 +217,9 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
 
   negateintres #(P) negateintres(.Xs, .Shifted, .Signed, .Int64, .Plus1, .CvtNegResMsbs, .CvtNegRes);
 
-  specialcase #(P) specialcase(.Xs, .Xm, .Ym, .Zm, .XZero, .IntInvalid,
+  specialcase #(P) specialcase(.Xs, .Xm, .Ym, .Zm, .XZero, .IntInvalid, 
       .IntZero, .Frm, .OutFmt, .XNaN, .YNaN, .ZNaN, .CvtResUf, 
-      .NaNIn, .IntToFp, .Int64, .Signed, .CvtOp, .FmaOp, .Plus1, .Invalid, .Overflow, .InfIn, .CvtNegRes,
+      .NaNIn, .IntToFp, .Int64, .Signed, .Zfa, .CvtOp, .FmaOp, .Plus1, .Invalid, .Overflow, .InfIn, .CvtNegRes,
       .XInf, .YInf, .DivOp, .DivByZero, .FullRe, .CvtCe, .Rs, .Re, .Rf, .PostProcRes, .FCvtIntRes);
 
 endmodule
diff --git a/src/fpu/postproc/specialcase.sv b/src/fpu/postproc/specialcase.sv
index 677ccce16..76784e4a1 100644
--- a/src/fpu/postproc/specialcase.sv
+++ b/src/fpu/postproc/specialcase.sv
@@ -53,6 +53,7 @@ module specialcase import cvw::*;  #(parameter cvw_t P) (
   input  logic                 IntToFp,           // is cvt int -> fp opperation
   input  logic                 Int64,             // is the integer 64 bits
   input  logic                 Signed,            // is the integer signed
+  input  logic                 Zfa,               // Zfa conversion operation: fcvtmod.w.d
   input  logic [P.NE:0]        CvtCe,             // the calculated expoent for cvt
   input  logic                 IntInvalid,        // integer invalid flag to choose the result
   input  logic                 CvtResUf,          // does the convert result underflow
@@ -70,10 +71,12 @@ module specialcase import cvw::*;  #(parameter cvw_t P) (
   logic [P.FLEN-1:0]   OfRes;      // overflowed result result
   logic [P.FLEN-1:0]   NormRes;    // normal result
   logic [P.XLEN-1:0]   OfIntRes;   // the overflow result for integer output
+  logic [P.XLEN-1:0]   OfIntRes2;  // the overflow result for integer output after accounting for fcvtmod.w.d
+  logic [P.XLEN-1:0]   Int64Res;   // Result for conversion to 64-bit int after accounting for fcvtmod.w.d
   logic                OfResMax;   // does the of result output maximum norm fp number
   logic                KillRes;    // kill the result for underflow
-  logic                SelOfRes;   // should the overflow result be selected
-
+  logic                SelOfRes;   // should the overflow result be selected (excluding convert)
+  logic                SelCvtOfRes; // select overflow result for convert instruction
 
   // does the overflow result output the maximum normalized floating point number
   //                output infinity if the input is infinity
@@ -329,6 +332,25 @@ module specialcase import cvw::*;  #(parameter cvw_t P) (
             else          OfIntRes = {P.XLEN{1'b1}}; // unsigned positive
     end  
    
+  // fcvtmod.w.d logic
+  // fcvtmod.w.d is like fcvt.w.d except that it takes bits [31:0] and sign extends the rest,
+  // and converts +/-inf and NaN to zero.
+
+  if (P.ZFA_SUPPORTED & P.D_SUPPORTED) // fcvtmod.w.d support
+    always_comb begin
+        if (Zfa) OfIntRes2 = '0;                
+        else     OfIntRes2 = OfIntRes;
+        if (Zfa) Int64Res = {{(P.XLEN-32){CvtNegRes[P.XLEN-1]}}, CvtNegRes[31:0]};
+        else     Int64Res = CvtNegRes[P.XLEN-1:0];
+        if (Zfa) SelCvtOfRes = InfIn | NaNIn; // fcvtmod.w.d only overflows to 0 on NaN or Infinity
+        else     SelCvtOfRes = IntInvalid;    // regular fcvt gives an overflow if out of range
+    end
+  else 
+    always_comb begin // no fcvtmod.w.d support
+        OfIntRes2 = OfIntRes;
+        Int64Res = CvtNegRes[P.XLEN-1:0];
+        SelCvtOfRes = IntInvalid;
+    end
 
   // select the integer output
   //      - if the input is invalid (out of bounds NaN or Inf) then output overflow res
@@ -337,10 +359,10 @@ module specialcase import cvw::*;  #(parameter cvw_t P) (
   //          - otherwise output a rounded 0
   //      - otherwise output the normal res (trmined and sign extended if nessisary)
   always_comb
-    if(IntInvalid)          FCvtIntRes = OfIntRes;
+    if(SelCvtOfRes)         FCvtIntRes = OfIntRes2; 
     else if(CvtCe[P.NE]) 
       if(Xs&Signed&Plus1)   FCvtIntRes = {{P.XLEN{1'b1}}};
       else                  FCvtIntRes = {{P.XLEN-1{1'b0}}, Plus1};
-    else if(Int64)          FCvtIntRes = CvtNegRes[P.XLEN-1:0];
+    else if(Int64)          FCvtIntRes = Int64Res;
     else                    FCvtIntRes = {{P.XLEN-32{CvtNegRes[31]}}, CvtNegRes[31:0]};
 endmodule
diff --git a/testbench/common/instrNameDecTB.sv b/testbench/common/instrNameDecTB.sv
index a3b5ef58e..ee6cd6900 100644
--- a/testbench/common/instrNameDecTB.sv
+++ b/testbench/common/instrNameDecTB.sv
@@ -298,6 +298,18 @@ module instrNameDecTB(
                        else if (funct7[6:2] == 5'b11100 & funct3 == 3'b001) name = "FCLASS";
                        else if (funct7[6:2] == 5'b00100 & funct3 == 3'b010) name = "FSGNJX";
                        else if (funct7[6:2] == 5'b10100 & funct3 == 3'b010) name = "FEQ";
+                       else if (funct7[6:2] == 5'b11110 & funct3 == 3'b000 & rs2 == 5'b00001) name = "FLI";
+                       else if (funct7[6:2] == 5'b00101 & funct3 == 3'b010) name = "FMINM";
+                       else if (funct7[6:2] == 5'b00101 & funct3 == 3'b011) name = "FMAXM";
+                       else if (funct7[6:2] == 5'b01000 & rs2 == 5'b00100) name = "FROUND";
+                       else if (funct7[6:2] == 5'b01000 & rs2 == 5'b00101) name = "FROUNDNX";
+                       else if (funct7[6:2] == 5'b10100 & funct3 == 3'b100) name = "FLEQ";
+                       else if (funct7[6:2] == 5'b10100 & funct3 == 3'b101) name = "FLTQ";
+                       else if (funct7 == 7'b1110001 & funct3 == 3'b000 & rs2 == 5'b00001) name = "FMVH.X.D";
+                       else if (funct7 == 7'b1110011 & funct3 == 3'b000 & rs2 == 5'b00001) name = "FMVH.X.Q";
+                       else if (funct7 == 7'b1011001 & funct3 == 3'b000) name = "FMVP.D.X";
+                       else if (funct7 == 7'b1011011 & funct3 == 3'b000) name = "FMVP.Q.X";
+                       else if (funct7 == 7'b1100001 & funct3 == 3'b001 & rs2 == 5'b01000) name = "FCVTMOD.W.D";
                        else                              name = "ILLEGAL";
       10'b0000111_010: name = "FLW";
       10'b0100111_010: name = "FSW";
diff --git a/testbench/tests.vh b/testbench/tests.vh
index 7d2d320a8..fecf4ebc9 100644
--- a/testbench/tests.vh
+++ b/testbench/tests.vh
@@ -2015,6 +2015,13 @@ string arch64zbs[] = '{
 
   string arch32zfad[] = '{
     `RISCVARCHTEST,
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b22-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b23-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b24-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b27-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b28-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b29-01.S",
     "rv32i_m/D_Zfa/src/fleq_b1-01.S",
     "rv32i_m/D_Zfa/src/fleq_b19-01.S", 
     "rv32i_m/D_Zfa/src/fleq.d_b1-01.S",

From 8b60992e72c629add0b2d090b83ecf4711b5a3ab Mon Sep 17 00:00:00 2001
From: naichewa <nwhyteaguayo@g.hmc.edu>
Date: Wed, 17 Jan 2024 14:38:11 -0800
Subject: [PATCH 08/20] fixed SPI tests failing when no icache

---
 .../riscv-test-suite/rv32i_m/privilege/src/WALLY-spi-01.S        | 1 +
 .../riscv-test-suite/rv64i_m/privilege/src/WALLY-spi-01.S        | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-spi-01.S b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-spi-01.S
index b9c82c92d..3d0abc6a0 100644
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-spi-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-spi-01.S
@@ -607,6 +607,7 @@ SETUP_PLIC
 .4byte delay1, 0x0000001, write32_test      # reset delay1 register
 .4byte cs_mode, 0x00000000, write32_test    # reset cs_mode
 .4byte tx_mark, 0x00000001, write32_test    # set transmit watermark to 1 (any entry turns mark off)
+.4byte sck_div, 0x00000100, write32_test    # lower SPI clock rate so read32_tests trigger at correct times
 #.4byte ie, 0x00000000, write32_test         # enable transmit interrupt
 .4byte ip, 0x00000001, read32_test          # tx watermark interupt should be pending
 .4byte 0x0, 0x00000000, readmip_test
diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-spi-01.S b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-spi-01.S
index 266b0e74f..11aebe333 100644
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-spi-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-spi-01.S
@@ -608,6 +608,7 @@ SETUP_PLIC
 
 .8byte delay1, 0x0000001, write32_test      # reset delay1 register
 .8byte cs_mode, 0x00000000, write32_test    # reset cs_mode
+.8byte sck_div, 0x00000100, write32_test    # lower SPI clock rate so reads are done at correct time when ICACHE not supported
 .8byte tx_mark, 0x00000001, write32_test    # set transmit watermark to 1 (any entry turns mark off)
 #.8byte ie, 0x00000000, write32_test         # enable transmit interrupt
 .8byte ip, 0x00000001, read32_test          # tx watermark interupt should be pending

From 911b400af2cfd4ce47e6d48a748d7983684ba184 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Thu, 18 Jan 2024 13:13:56 -0800
Subject: [PATCH 09/20] Fault on misaligned AMO

---
 src/mmu/mmu.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mmu/mmu.sv b/src/mmu/mmu.sv
index c26ee2a44..dffa7be76 100644
--- a/src/mmu/mmu.sv
+++ b/src/mmu/mmu.sv
@@ -140,7 +140,7 @@ module mmu import cvw::*;  #(parameter cvw_t P,
       2'b11:  DataMisalignedM = |VAdr[2:0];        // ld, sd, fld, fsd
     endcase 
   assign LoadMisalignedFaultM     = DataMisalignedM & ReadNoAmoAccessM & ~(P.ZICCLSM_SUPPORTED & Cacheable); 
-  assign StoreAmoMisalignedFaultM = DataMisalignedM & WriteAccessM & ~(P.ZICCLSM_SUPPORTED & Cacheable);
+  assign StoreAmoMisalignedFaultM = DataMisalignedM & WriteAccessM & (~(P.ZICCLSM_SUPPORTED & Cacheable) | ReadAccessM); // Misaligned AMO faults even if ZICCLSM supported
 
   // Specify which type of page fault is occurring
   assign InstrPageFaultF    = TLBPageFault & ExecuteAccessF;

From 12b2baff827707f6934e71b6f1623f423fe08e1d Mon Sep 17 00:00:00 2001
From: Jordan Carlin <jordanmcarlin@gmail.com>
Date: Thu, 18 Jan 2024 17:33:59 -0800
Subject: [PATCH 10/20] add coverage of sfence.inval.ir instruction and fix
 sret coverage

---
 tests/coverage/priv.S | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/coverage/priv.S b/tests/coverage/priv.S
index aa9c8b50b..dcf56e14a 100644
--- a/tests/coverage/priv.S
+++ b/tests/coverage/priv.S
@@ -297,6 +297,16 @@ sretdone:
 
     wfi
 
+
+
+    # Test uncovered privdec instructions
+    # exercise sfence.inval.ir instruction
+    .word 0x18100073
+
+    # exercise sret with rs1 not 0
+    .word 0x102F8073
+
+
     j done
 
 

From 82d9467eeaf10135548b491e4224b5d5e0b638ec Mon Sep 17 00:00:00 2001
From: Jordan Carlin <jordanmcarlin@gmail.com>
Date: Thu, 18 Jan 2024 19:29:16 -0800
Subject: [PATCH 11/20] Add coverage of FIOM in different privelege modes

---
 tests/coverage/csrwrites.S | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tests/coverage/csrwrites.S b/tests/coverage/csrwrites.S
index 63ee00c38..ce5639bd7 100644
--- a/tests/coverage/csrwrites.S
+++ b/tests/coverage/csrwrites.S
@@ -37,4 +37,31 @@ main:
     csrrw t1, menvcfg, t0
     csrrw t2, senvcfg, t0
 
+    # testing FIOM with different privilege modes
+    # setting environment config (to both 1 and 0) in each privilege mode
+    csrsi menvcfg, 1
+    li a0, 1
+    ecall               # enter supervisor mode
+
+    li a0, 0            
+    ecall               # enter user mode
+
+    li a0, 1
+    ecall               # enter supervisor mode
+
+    csrsi senvcfg, 1
+    li a0, 0
+    ecall               # enter user mode
+
+    li a0, 3
+    ecall               # enter machine mode
+    csrci menvcfg, 1
+
+    li a0, 1
+    ecall               # enter supervisor mode
+
+    li a0, 0
+    ecall               # enter user mode
+
+
     j done

From f06f681dbd9491fb876261e8d2cd96382ae6628f Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Thu, 18 Jan 2024 21:30:39 -0800
Subject: [PATCH 12/20] CoreMark displays StoreStalls

---
 benchmarks/coremark/Makefile                  | 22 ++-----------------
 .../coremark/riscv64-baremetal/syscalls.c     |  1 +
 2 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/benchmarks/coremark/Makefile b/benchmarks/coremark/Makefile
index db8a6e1d6..6e466291e 100644
--- a/benchmarks/coremark/Makefile
+++ b/benchmarks/coremark/Makefile
@@ -11,8 +11,8 @@ sources=$(cmbase)/core_main.c $(cmbase)/core_list_join.c $(cmbase)/coremark.h  \
 	$(PORT_DIR)/core_portme.h $(PORT_DIR)/core_portme.c $(PORT_DIR)/core_portme.mak \
 	$(PORT_DIR)/crt.S $(PORT_DIR)/encoding.h $(PORT_DIR)/util.h $(PORT_DIR)/syscalls.c
 ABI := $(if $(findstring "64","$(XLEN)"),lp64,ilp32)
-ARCH := rv$(XLEN)im_zicsr_zba_zbb_zbc_zbs
-#ARCH := rv$(XLEN)gc_zba_zbb_zbc_zbs
+ARCH := rv$(XLEN)gc_zba_zbb_zbc
+#ARCH := rv$(XLEN)im_zicsr_zba_zbb_zbc
 #ARCH := rv$(XLEN)gc
 #ARCH := rv$(XLEN)imc_zicsr
 #ARCH := rv$(XLEN)im_zicsr
@@ -25,24 +25,6 @@ PORT_CFLAGS = -g -mabi=$(ABI) -march=$(ARCH) -static -falign-functions=16 \
 	-nostdlib -nostartfiles -ffreestanding -mstrict-align \
 	-DTOTAL_DATA_SIZE=2000 -DMAIN_HAS_NOARGC=1 -DPERFORMANCE_RUN=1 -DITERATIONS=10 -DXLEN=$(XLEN) 
 
-# Black Parrott
-#PORT_CFLAGS = -O2 -fno-common -funroll-loops -finline-functions --param max-inline-insns-auto=20 -falign-functions=4 -falign-jumps=4 -falign-loops=4 \
-	-DITERATIONS=10 -DPERFORMANCE_RUN=1
-#OPTIMIZE := -O2 -fno-common -funroll-loops -finline-functions --param max-inline-insns-auto=20 -falign-functions=4 -falign-jumps=4 -falign-loops=4
-#override CFLAGS += $(OPTIMIZE) -DFLAGS_STR=\""$(OPTIMIZE)"\"
-#override CFLAGS += -DITERATIONS=10 -DPERFORMANCE_RUN=1
-
-# try adding the new fields from muntjac coremark build
-#PORT_CFLAGS = -g -mabi=$(ABI) -march=$(ARCH) -static  -falign-functions=16 \
-	-fno-common -flto -funswitch-loops -mcmodel=medany \
-	-falign-functions=4 -falign-jumps=4 -falign-loops=4  \
-	-mbranch-cost=1 -DSKIP_DEFAULT_MEMSET -mtune=sifive-3-series -O3 -finline-functions --param max-inline-insns-auto=20 -falign-jumps=4 \
-	-fno-delete-null-pointer-checks -fno-rename-registers --param=loop-max-datarefs-for-datadeps=0 \
-	-funroll-all-loops --param=uninlined-function-insns=8 -fno-tree-vrp -fwrapv -fipa-pta \
-	-nostdlib -nostartfiles -ffreestanding -mstrict-align \
-	-DTOTAL_DATA_SIZE=2000 -DMAIN_HAS_NOARGC=1 -DPERFORMANCE_RUN=1 -DITERATIONS=10 -DXLEN=$(XLEN) 
-
-
 all: $(work_dir)/coremark.bare.riscv.elf.memfile
 
 run:
diff --git a/benchmarks/coremark/riscv64-baremetal/syscalls.c b/benchmarks/coremark/riscv64-baremetal/syscalls.c
index 29cd5f24a..25c47b797 100644
--- a/benchmarks/coremark/riscv64-baremetal/syscalls.c
+++ b/benchmarks/coremark/riscv64-baremetal/syscalls.c
@@ -177,6 +177,7 @@ void _init(int cid, int nc)
   counters[17] = read_csr(mhpmcounter17) - counters[17];
 
   ee_printf("Load Stalls %d\n", counters[11]);
+  ee_printf("Store Stalls %d\n", counters[12]);
   ee_printf("D-Cache Accesses %d\n", counters[13]);
   ee_printf("D-Cache Misses %d\n", counters[14]); 
   ee_printf("I-Cache Accesses %d\n", counters[16]);

From eb8ab3fae252211d1e600c9a641a04f1e71ab464 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Thu, 18 Jan 2024 21:30:59 -0800
Subject: [PATCH 13/20] EBU coverage exclusion

---
 sim/coverage-exclusions-rv64gc.do | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sim/coverage-exclusions-rv64gc.do b/sim/coverage-exclusions-rv64gc.do
index 76e18e30e..b9c20eead 100644
--- a/sim/coverage-exclusions-rv64gc.do
+++ b/sim/coverage-exclusions-rv64gc.do
@@ -253,3 +253,10 @@ coverage exclude -srcfile priorityonehot.sv
 # Excluding pmpadrdecs[0] coverage case for PAgePMPAdrIn being hardwired to 1
 coverage exclude -scope /dut/core/ifu/immu/immu/pmp/pmpchecker/pmp/pmpadrdecs[0] -linerange [GetLineNum ../src/mmu/pmpadrdec.sv "exclusion-tag: PAgePMPAdrIn"] -item e 1 -fecexprrow 1
 coverage exclude -scope /dut/core/lsu/dmmu/dmmu/pmp/pmpchecker/pmp/pmpadrdecs[0] -linerange [GetLineNum ../src/mmu/pmpadrdec.sv "exclusion-tag: PAgePMPAdrIn"] -item e 1 -fecexprrow 1
+
+####################
+# EBU
+####################
+
+# Exclude EBU Beat Counter because it is only idle when bus has multicycle latency, but rv64gc has single cycle latency
+coverage exclude -scope /core/ebu/ebu/ebufsmarb/BeatCounter

From 17c9be7695b6e7a65541c7e727c63d202c40bf65 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Thu, 18 Jan 2024 21:36:52 -0800
Subject: [PATCH 14/20] Cleanup typos, remove Zicond from riscof until it is
 working

---
 config/buildroot/config.vh               | 12 ++++++------
 src/fpu/postproc/flags.sv                |  2 +-
 src/fpu/postproc/postprocess.sv          |  2 +-
 testbench/testbench.sv                   |  4 ++--
 tests/riscof/spike/riscof_spike.py       |  4 ++++
 tests/riscof/spike/spike_rv32gc_isa.yaml |  3 ++-
 tests/riscof/spike/spike_rv64gc_isa.yaml |  3 ++-
 7 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/config/buildroot/config.vh b/config/buildroot/config.vh
index d36fcf6e3..de6e4800d 100644
--- a/config/buildroot/config.vh
+++ b/config/buildroot/config.vh
@@ -40,7 +40,7 @@ localparam ZIFENCEI_SUPPORTED = 1;
 localparam ZICNTR_SUPPORTED = 1;
 localparam ZIHPM_SUPPORTED = 1;
 localparam COUNTERS = 12'd32;
-localparam ZFH_SUPPORTED = 0;
+localparam ZFH_SUPPORTED = 1;
 localparam ZFA_SUPPORTED = 0;
 localparam SSTC_SUPPORTED = 1;
 localparam ZICBOM_SUPPORTED = 1;
@@ -57,7 +57,7 @@ localparam BUS_SUPPORTED = 1;
 localparam DCACHE_SUPPORTED = 1;
 localparam ICACHE_SUPPORTED = 1;
 localparam VIRTMEM_SUPPORTED = 1;
-localparam VECTORED_INTERRUPTS_SUPPORTED = 1 ;
+localparam VECTORED_INTERRUPTS_SUPPORTED = 1;
 localparam BIGENDIAN_SUPPORTED = 1;
 
 // TLB configuration.  Entries should be a power of 2
@@ -163,10 +163,10 @@ localparam RADIX = 32'h4;
 localparam DIVCOPIES = 32'h4;
 
 // bit manipulation
-localparam ZBA_SUPPORTED = 0;
-localparam ZBB_SUPPORTED = 0;
-localparam ZBC_SUPPORTED = 0;
-localparam ZBS_SUPPORTED = 0;
+localparam ZBA_SUPPORTED = 1;
+localparam ZBB_SUPPORTED = 1;
+localparam ZBC_SUPPORTED = 1;
+localparam ZBS_SUPPORTED = 1;
 
 // New compressed instructions
 localparam ZCB_SUPPORTED = 1;
diff --git a/src/fpu/postproc/flags.sv b/src/fpu/postproc/flags.sv
index 98ed0a34d..50d9bf229 100644
--- a/src/fpu/postproc/flags.sv
+++ b/src/fpu/postproc/flags.sv
@@ -70,7 +70,7 @@ module flags import cvw::*;  #(parameter cvw_t P) (
   logic                        DivInvalid;             // integer invalid flag
   logic                        Underflow;              // Underflow flag
   logic                        ResExpGteMax;           // is the result greater than or equal to the maximum floating point expoent
-  logic                        ShiftGtIntSz;           // is the shift greater than the the integer size (use Re to account for possible roundning "shift")
+  logic                        ShiftGtIntSz;           // is the shift greater than the integer size (use Re to account for possible rounding "shift")
 
   ///////////////////////////////////////////////////////////////////////////////
   // Overflow
diff --git a/src/fpu/postproc/postprocess.sv b/src/fpu/postproc/postprocess.sv
index 516752a78..1d51fdf85 100644
--- a/src/fpu/postproc/postprocess.sv
+++ b/src/fpu/postproc/postprocess.sv
@@ -89,7 +89,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   logic [P.NE+1:0]             NormSumExp;           // exponent of the normalized sum not taking into account Subnormal or zero results
   logic                        FmaPreResultSubnorm;  // is the result subnormal - calculated before LZA corection
   logic [$clog2(3*P.NF+5)-1:0] FmaShiftAmt;          // normalization shift amount for fma
-  // division singals
+  // division signals
   logic [P.LOGNORMSHIFTSZ-1:0] DivShiftAmt;          // divsqrt shif amount
   logic [P.NORMSHIFTSZ-1:0]    DivShiftIn;           // divsqrt shift input
   logic [P.NE+1:0]             Ue;                   // divsqrt corrected exponent after corretion shift
diff --git a/testbench/testbench.sv b/testbench/testbench.sv
index 87b603288..b20c6a993 100644
--- a/testbench/testbench.sv
+++ b/testbench/testbench.sv
@@ -632,8 +632,8 @@ task automatic updateProgramAddrLabelArray;
     end
   end
 
-  if(ProgramAddrLabelArray["begin_signature"] == 0) $display("Couldn't find begin_signature in %s", ProgramLabelMapFile);
-  if(ProgramAddrLabelArray["sig_end_canary"] == 0) $display("Couldn't find sig_end_canary in %s", ProgramLabelMapFile);
+//  if(ProgramAddrLabelArray["begin_signature"] == 0) $display("Couldn't find begin_signature in %s", ProgramLabelMapFile);
+//  if(ProgramAddrLabelArray["sig_end_canary"] == 0) $display("Couldn't find sig_end_canary in %s", ProgramLabelMapFile);
 
   $fclose(ProgramLabelMapFP);
   $fclose(ProgramAddrMapFP);
diff --git a/tests/riscof/spike/riscof_spike.py b/tests/riscof/spike/riscof_spike.py
index 61b556932..5450f64df 100644
--- a/tests/riscof/spike/riscof_spike.py
+++ b/tests/riscof/spike/riscof_spike.py
@@ -115,6 +115,10 @@ class spike(pluginTemplate):
           self.isa += '_Zicond'
       if "Zicboz" in ispec["ISA"]:
           self.isa += '_Zicboz'
+      if "Zfa" in ispec["ISA"]:
+          self.isa += '_Zfa'
+      if "Zfh" in ispec["ISA"]:
+          self.isa += '_Zfh'
       if "Zca" in ispec["ISA"]:
           self.isa += '_Zca'
       if "Zcb" in ispec["ISA"]:
diff --git a/tests/riscof/spike/spike_rv32gc_isa.yaml b/tests/riscof/spike/spike_rv32gc_isa.yaml
index ae314fa76..7d97edb6a 100644
--- a/tests/riscof/spike/spike_rv32gc_isa.yaml
+++ b/tests/riscof/spike/spike_rv32gc_isa.yaml
@@ -1,6 +1,7 @@
 hart_ids: [0]
 hart0:
-  ISA: RV32IMAFDCZicsr_Zicond_Zifencei_Zba_Zbb_Zbc_Zbs
+  ISA: RV32IMAFDCZicsr_Zifencei_Zba_Zbb_Zbc_Zbs
+#  ISA: RV32IMAFDCZicsr_Zicond_Zifencei_Zfa_Zfh_Zba_Zbb_Zbc_Zbs
 #  ISA: RV32IMAFDCZicsr_Zicboz_Zifencei_Zca_Zba_Zbb_Zbc_Zbs # _Zbkb_Zcb
   physical_addr_sz: 32
   User_Spec_Version: '2.3'
diff --git a/tests/riscof/spike/spike_rv64gc_isa.yaml b/tests/riscof/spike/spike_rv64gc_isa.yaml
index df5e7cb2b..471fbbb13 100644
--- a/tests/riscof/spike/spike_rv64gc_isa.yaml
+++ b/tests/riscof/spike/spike_rv64gc_isa.yaml
@@ -2,7 +2,8 @@ hart_ids: [0]
 hart0:
 #  ISA: RV64IMAFDCSUZicsr_Zicboz_Zifencei_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
 #  ISA: RV64IMAFDCSUZicsr_Zifencei_Zca_Zcb_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
-  ISA: RV64IMAFDCSUZicsr_Zicond_Zifencei_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
+#  ISA: RV64IMAFDCSUZicsr_Zicond_Zifencei_Zfa_Zfh_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
+  ISA: RV64IMAFDCSUZicsr_Zifencei_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
   physical_addr_sz: 56
   User_Spec_Version: '2.3'
   supported_xlen: [64]

From 9614913e8f54285096d2c162b2cc9e130e914e9a Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Thu, 18 Jan 2024 22:10:20 -0800
Subject: [PATCH 15/20] Changed CoreMark Makefile to rv64im

---
 benchmarks/coremark/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/coremark/Makefile b/benchmarks/coremark/Makefile
index 6e466291e..a73dc6eea 100644
--- a/benchmarks/coremark/Makefile
+++ b/benchmarks/coremark/Makefile
@@ -11,8 +11,8 @@ sources=$(cmbase)/core_main.c $(cmbase)/core_list_join.c $(cmbase)/coremark.h  \
 	$(PORT_DIR)/core_portme.h $(PORT_DIR)/core_portme.c $(PORT_DIR)/core_portme.mak \
 	$(PORT_DIR)/crt.S $(PORT_DIR)/encoding.h $(PORT_DIR)/util.h $(PORT_DIR)/syscalls.c
 ABI := $(if $(findstring "64","$(XLEN)"),lp64,ilp32)
-ARCH := rv$(XLEN)gc_zba_zbb_zbc
-#ARCH := rv$(XLEN)im_zicsr_zba_zbb_zbc
+#ARCH := rv$(XLEN)gc_zba_zbb_zbc
+ARCH := rv$(XLEN)im_zicsr_zba_zbb_zbc
 #ARCH := rv$(XLEN)gc
 #ARCH := rv$(XLEN)imc_zicsr
 #ARCH := rv$(XLEN)im_zicsr

From 9260d3c424092d3cd660fb5f3055f600b6ee0f6e Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Thu, 18 Jan 2024 22:46:07 -0800
Subject: [PATCH 16/20] Add Zfh support to imperas.ic, use Zicond in riscof now
 that it is fixed in riscv-arch-test

---
 config/rv32gc/config.vh                  | 4 ++--
 sim/imperas.ic                           | 1 +
 tests/riscof/spike/spike_rv32gc_isa.yaml | 3 +--
 tests/riscof/spike/spike_rv64gc_isa.yaml | 3 +--
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/config/rv32gc/config.vh b/config/rv32gc/config.vh
index a59bb1ab3..4baef0075 100644
--- a/config/rv32gc/config.vh
+++ b/config/rv32gc/config.vh
@@ -41,8 +41,8 @@ localparam ZIFENCEI_SUPPORTED = 1;
 localparam COUNTERS = 12'd32;
 localparam ZICNTR_SUPPORTED = 1;
 localparam ZIHPM_SUPPORTED = 1;
-localparam ZFH_SUPPORTED = 0;
-localparam ZFA_SUPPORTED = 0;
+localparam ZFH_SUPPORTED = 1;
+localparam ZFA_SUPPORTED = 1;
 localparam SSTC_SUPPORTED = 1;
 localparam ZICBOM_SUPPORTED = 1;
 localparam ZICBOZ_SUPPORTED = 1;
diff --git a/sim/imperas.ic b/sim/imperas.ic
index f3c620b96..5de5935c6 100644
--- a/sim/imperas.ic
+++ b/sim/imperas.ic
@@ -20,6 +20,7 @@
 # More extensions
 --override cpu/Zcb=T
 --override cpu/Zicond=T
+--override cpu/Zfh=T
 
 # Cache block operations
 --override cpu/Zicbom=T
diff --git a/tests/riscof/spike/spike_rv32gc_isa.yaml b/tests/riscof/spike/spike_rv32gc_isa.yaml
index 7d97edb6a..c2c95fbf4 100644
--- a/tests/riscof/spike/spike_rv32gc_isa.yaml
+++ b/tests/riscof/spike/spike_rv32gc_isa.yaml
@@ -1,7 +1,6 @@
 hart_ids: [0]
 hart0:
-  ISA: RV32IMAFDCZicsr_Zifencei_Zba_Zbb_Zbc_Zbs
-#  ISA: RV32IMAFDCZicsr_Zicond_Zifencei_Zfa_Zfh_Zba_Zbb_Zbc_Zbs
+  ISA: RV32IMAFDCZicsr_Zicond_Zifencei_Zfa_Zfh_Zba_Zbb_Zbc_Zbs
 #  ISA: RV32IMAFDCZicsr_Zicboz_Zifencei_Zca_Zba_Zbb_Zbc_Zbs # _Zbkb_Zcb
   physical_addr_sz: 32
   User_Spec_Version: '2.3'
diff --git a/tests/riscof/spike/spike_rv64gc_isa.yaml b/tests/riscof/spike/spike_rv64gc_isa.yaml
index 471fbbb13..4374ad07c 100644
--- a/tests/riscof/spike/spike_rv64gc_isa.yaml
+++ b/tests/riscof/spike/spike_rv64gc_isa.yaml
@@ -2,8 +2,7 @@ hart_ids: [0]
 hart0:
 #  ISA: RV64IMAFDCSUZicsr_Zicboz_Zifencei_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
 #  ISA: RV64IMAFDCSUZicsr_Zifencei_Zca_Zcb_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
-#  ISA: RV64IMAFDCSUZicsr_Zicond_Zifencei_Zfa_Zfh_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
-  ISA: RV64IMAFDCSUZicsr_Zifencei_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
+  ISA: RV64IMAFDCSUZicsr_Zicond_Zifencei_Zfa_Zfh_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
   physical_addr_sz: 56
   User_Spec_Version: '2.3'
   supported_xlen: [64]

From 324180244178e85510cbecd953cfe181c20a9893 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sun, 21 Jan 2024 08:25:17 -0800
Subject: [PATCH 17/20] fixed bug in CORRSHIFTSZ param

---
 config/shared/config-shared.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 86f9a0a9e..ba215785c 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -111,7 +111,7 @@ localparam LLEN = (($unsigned(FLEN)<$unsigned(XLEN)) ? ($unsigned(XLEN)) : ($uns
 localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));
 localparam NORMSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVb + 1 +NF+1) > (3*NF+6) ? (DIVb + 1 +NF+1) : (3*NF+6)));
 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));
-localparam CORRSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVMINb+1+NF) > (3*NF+4) ? (DIVMINb+1+NF) : (3*NF+4)));
+localparam CORRSHIFTSZ = (((DIVMINb+1+NF) > (3*NF+4) ? (DIVMINb+1+NF) : (3*NF+4))); // max(DIVMINb+NF+1, 3*NF+4)
 
 
 // Disable spurious Verilator warnings

From 1459943a7537ff184ab00d27a476129d2c1d2c91 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sun, 21 Jan 2024 10:08:48 -0800
Subject: [PATCH 18/20] more shiftcorrection bug fixes

---
 config/shared/config-shared.vh      | 2 +-
 src/fpu/postproc/shiftcorrection.sv | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index ba215785c..5dfb4b1ba 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -111,7 +111,7 @@ localparam LLEN = (($unsigned(FLEN)<$unsigned(XLEN)) ? ($unsigned(XLEN)) : ($uns
 localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));
 localparam NORMSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVb + 1 +NF+1) > (3*NF+6) ? (DIVb + 1 +NF+1) : (3*NF+6)));
 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));
-localparam CORRSHIFTSZ = (((DIVMINb+1+NF) > (3*NF+4) ? (DIVMINb+1+NF) : (3*NF+4))); // max(DIVMINb+NF+1, 3*NF+4)
+localparam CORRSHIFTSZ = NORMSHIFTSZ-2;
 
 
 // Disable spurious Verilator warnings
diff --git a/src/fpu/postproc/shiftcorrection.sv b/src/fpu/postproc/shiftcorrection.sv
index f5860b42d..1da3556d8 100644
--- a/src/fpu/postproc/shiftcorrection.sv
+++ b/src/fpu/postproc/shiftcorrection.sv
@@ -44,7 +44,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
   output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
 );
 
-  logic [3*P.NF+3:0]               CorrSumShifted;         // the shifted sum after LZA correction
+  logic [P.CORRSHIFTSZ-1:0]        CorrSumShifted;         // the shifted sum after LZA correction
   logic [P.CORRSHIFTSZ-1:0]        CorrQm0, CorrQm1;       // portions of Shifted to select for CorrQmShifted
   logic [P.CORRSHIFTSZ-1:0]        CorrQmShifted;          // the shifted divsqrt result after one bit shift
   logic                            ResSubnorm;             // is the result Subnormal
@@ -68,7 +68,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
   
   // if the result of the divider was calculated to be subnormal, then the result was correctly normalized, so select the top shifted bits
   always_comb
-    if(FmaOp)                       Mf = {CorrSumShifted, {P.CORRSHIFTSZ-(3*P.NF+4){1'b0}}};
+    if(FmaOp)                       Mf = {CorrSumShifted};
     else if (DivOp&~DivResSubnorm)  Mf = CorrQmShifted;
     else                            Mf = Shifted[P.NORMSHIFTSZ-1:P.NORMSHIFTSZ-P.CORRSHIFTSZ];
     

From 4936496bb9852f08db33fe3904a7d258be73d0f7 Mon Sep 17 00:00:00 2001
From: Jordan Carlin <jordanmcarlin@gmail.com>
Date: Mon, 22 Jan 2024 08:58:31 -0800
Subject: [PATCH 19/20] fix sfence.inval.ir and sret coverage from previous PR

---
 tests/coverage/priv.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/coverage/priv.S b/tests/coverage/priv.S
index dcf56e14a..6b5260259 100644
--- a/tests/coverage/priv.S
+++ b/tests/coverage/priv.S
@@ -300,6 +300,8 @@ sretdone:
 
 
     # Test uncovered privdec instructions
+    li a0, 3
+    ecall
     # exercise sfence.inval.ir instruction
     .word 0x18100073
 

From 0c13e14bbf922db0306d9b72285620c565554a34 Mon Sep 17 00:00:00 2001
From: Jordan Carlin <jordanmcarlin@gmail.com>
Date: Mon, 22 Jan 2024 09:52:58 -0800
Subject: [PATCH 20/20] coverage improvements for mret when mpp = 3; update
 imperas config

---
 sim/imperas.ic        |  2 ++
 tests/coverage/priv.S | 15 ++++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/sim/imperas.ic b/sim/imperas.ic
index 5de5935c6..b35166429 100644
--- a/sim/imperas.ic
+++ b/sim/imperas.ic
@@ -37,6 +37,8 @@
 # SV39 and SV48 supported
 --override cpu/Sv_modes=768
 
+--override cpu/Svinval=T
+
 
 #  clarify
 #--override refRoot/cpu/mtvec_sext=F
diff --git a/tests/coverage/priv.S b/tests/coverage/priv.S
index 6b5260259..39b3a8aeb 100644
--- a/tests/coverage/priv.S
+++ b/tests/coverage/priv.S
@@ -309,7 +309,20 @@ sretdone:
     .word 0x102F8073
 
 
-    j done
+    # cover mret when mpp = 3 and mprv = 1
+    li a0, 3
+    ecall               # enter machine mode
+    bseti t0, zero, 17
+    csrs mstatus, t0    # set MPRV
+    li t1, 0x00001800   
+    csrs mstatus, t1    # set MPP=3
+    la t1, finished
+    csrr t0, mepc       
+    csrw mepc, t1       # set mepc for mret to jump to
+    mret
+
+
+finished: j done