Merge branch 'main' of https://github.com/openhwgroup/cvw

2025-02-11 06:05:49 +00:00 · 2024-01-23 14:37:11 -06:00 · 2024-01-23 14:37:11 -06:00 · d5bbb5ea27
commit d5bbb5ea27
parent 4c2ba2b0b4 cb597d2c85
27 changed files with 607 additions and 166 deletions
--- a/benchmarks/coremark/Makefile
+++ b/benchmarks/coremark/Makefile
@ -11,8 +11,8 @@ sources=$(cmbase)/core_main.c $(cmbase)/core_list_join.c $(cmbase)/coremark.h  \
 	$(PORT_DIR)/core_portme.h $(PORT_DIR)/core_portme.c $(PORT_DIR)/core_portme.mak \
 	$(PORT_DIR)/crt.S $(PORT_DIR)/encoding.h $(PORT_DIR)/util.h $(PORT_DIR)/syscalls.c
 ABI := $(if $(findstring "64","$(XLEN)"),lp64,ilp32)
-ARCH := rv$(XLEN)im_zicsr_zba_zbb_zbc_zbs
-#ARCH := rv$(XLEN)gc_zba_zbb_zbc_zbs
+#ARCH := rv$(XLEN)gc_zba_zbb_zbc
+ARCH := rv$(XLEN)im_zicsr_zba_zbb_zbc
 #ARCH := rv$(XLEN)gc
 #ARCH := rv$(XLEN)imc_zicsr
 #ARCH := rv$(XLEN)im_zicsr
@ -25,24 +25,6 @@ PORT_CFLAGS = -g -mabi=$(ABI) -march=$(ARCH) -static -falign-functions=16 \
 	-nostdlib -nostartfiles -ffreestanding -mstrict-align \
 	-DTOTAL_DATA_SIZE=2000 -DMAIN_HAS_NOARGC=1 -DPERFORMANCE_RUN=1 -DITERATIONS=10 -DXLEN=$(XLEN) 

-# Black Parrott
-#PORT_CFLAGS = -O2 -fno-common -funroll-loops -finline-functions --param max-inline-insns-auto=20 -falign-functions=4 -falign-jumps=4 -falign-loops=4 \
-	-DITERATIONS=10 -DPERFORMANCE_RUN=1
-#OPTIMIZE := -O2 -fno-common -funroll-loops -finline-functions --param max-inline-insns-auto=20 -falign-functions=4 -falign-jumps=4 -falign-loops=4
-#override CFLAGS += $(OPTIMIZE) -DFLAGS_STR=\""$(OPTIMIZE)"\"
-#override CFLAGS += -DITERATIONS=10 -DPERFORMANCE_RUN=1
-
-# try adding the new fields from muntjac coremark build
-#PORT_CFLAGS = -g -mabi=$(ABI) -march=$(ARCH) -static  -falign-functions=16 \
-	-fno-common -flto -funswitch-loops -mcmodel=medany \
-	-falign-functions=4 -falign-jumps=4 -falign-loops=4  \
-	-mbranch-cost=1 -DSKIP_DEFAULT_MEMSET -mtune=sifive-3-series -O3 -finline-functions --param max-inline-insns-auto=20 -falign-jumps=4 \
-	-fno-delete-null-pointer-checks -fno-rename-registers --param=loop-max-datarefs-for-datadeps=0 \
-	-funroll-all-loops --param=uninlined-function-insns=8 -fno-tree-vrp -fwrapv -fipa-pta \
-	-nostdlib -nostartfiles -ffreestanding -mstrict-align \
-	-DTOTAL_DATA_SIZE=2000 -DMAIN_HAS_NOARGC=1 -DPERFORMANCE_RUN=1 -DITERATIONS=10 -DXLEN=$(XLEN) 
-
-
 all: $(work_dir)/coremark.bare.riscv.elf.memfile

 run:
--- a/benchmarks/coremark/riscv64-baremetal/syscalls.c
+++ b/benchmarks/coremark/riscv64-baremetal/syscalls.c
@ -177,6 +177,7 @@ void _init(int cid, int nc)
  counters[17] = read_csr(mhpmcounter17) - counters[17];

  ee_printf("Load Stalls %d\n", counters[11]);
+  ee_printf("Store Stalls %d\n", counters[12]);
  ee_printf("D-Cache Accesses %d\n", counters[13]);
  ee_printf("D-Cache Misses %d\n", counters[14]); 
  ee_printf("I-Cache Accesses %d\n", counters[16]);
--- a/config/buildroot/config.vh
+++ b/config/buildroot/config.vh
@ -40,7 +40,7 @@ localparam ZIFENCEI_SUPPORTED = 1;
 localparam ZICNTR_SUPPORTED = 1;
 localparam ZIHPM_SUPPORTED = 1;
 localparam COUNTERS = 12'd32;
-localparam ZFH_SUPPORTED = 0;
+localparam ZFH_SUPPORTED = 1;
 localparam ZFA_SUPPORTED = 0;
 localparam SSTC_SUPPORTED = 1;
 localparam ZICBOM_SUPPORTED = 1;
@ -57,7 +57,7 @@ localparam BUS_SUPPORTED = 1;
 localparam DCACHE_SUPPORTED = 1;
 localparam ICACHE_SUPPORTED = 1;
 localparam VIRTMEM_SUPPORTED = 1;
-localparam VECTORED_INTERRUPTS_SUPPORTED = 1 ;
+localparam VECTORED_INTERRUPTS_SUPPORTED = 1;
 localparam BIGENDIAN_SUPPORTED = 1;

 // TLB configuration.  Entries should be a power of 2
@ -163,10 +163,10 @@ localparam RADIX = 32'h4;
 localparam DIVCOPIES = 32'h4;

 // bit manipulation
-localparam ZBA_SUPPORTED = 0;
-localparam ZBB_SUPPORTED = 0;
-localparam ZBC_SUPPORTED = 0;
-localparam ZBS_SUPPORTED = 0;
+localparam ZBA_SUPPORTED = 1;
+localparam ZBB_SUPPORTED = 1;
+localparam ZBC_SUPPORTED = 1;
+localparam ZBS_SUPPORTED = 1;

 // New compressed instructions
 localparam ZCB_SUPPORTED = 1;
--- a/config/rv32gc/config.vh
+++ b/config/rv32gc/config.vh
@ -41,8 +41,8 @@ localparam ZIFENCEI_SUPPORTED = 1;
 localparam COUNTERS = 12'd32;
 localparam ZICNTR_SUPPORTED = 1;
 localparam ZIHPM_SUPPORTED = 1;
-localparam ZFH_SUPPORTED = 0;
-localparam ZFA_SUPPORTED = 0;
+localparam ZFH_SUPPORTED = 1;
+localparam ZFA_SUPPORTED = 1;
 localparam SSTC_SUPPORTED = 1;
 localparam ZICBOM_SUPPORTED = 1;
 localparam ZICBOZ_SUPPORTED = 1;
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@ -111,7 +111,7 @@ localparam LLEN = (($unsigned(FLEN)<$unsigned(XLEN)) ? ($unsigned(XLEN)) : ($uns
 localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));
 localparam NORMSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVb + 1 +NF+1) > (3*NF+6) ? (DIVb + 1 +NF+1) : (3*NF+6)));
 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));
-localparam CORRSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVMINb+1+NF) > (3*NF+4) ? (DIVMINb+1+NF) : (3*NF+4)));
+localparam CORRSHIFTSZ = NORMSHIFTSZ-2;


 // Disable spurious Verilator warnings
--- a/setup.sh
+++ b/setup.sh
@ -16,8 +16,7 @@ echo \$WALLY set to ${WALLY}
 # Must edit these based on your local environment.  Ask your sysadmin.
 export MGLS_LICENSE_FILE=27002@zircon.eng.hmc.edu                   # Change this to your Siemens license server
 export SNPSLMD_LICENSE_FILE=27020@zircon.eng.hmc.edu                # Change this to your Synopsys license server
-export QUESTA_HOME=/cad/mentor/questa_sim-2022.4_2/questasim        # Change this for your path to Questa, excluding bin
-#export QUESTA_HOME=/cad/mentor/questa_sim-2022.4_3/questasim        # Change this for your path to Questa, excluding bin
+export QUESTA_HOME=/cad/mentor/questa_sim-2023.4/questasim        # Change this for your path to Questa, excluding bin
 export SNPS_HOME=/cad/synopsys/SYN                                  # Change this for your path to Design Compiler, excluding bin

 # Path to RISC-V Tools
--- a/sim/coverage-exclusions-rv64gc.do
+++ b/sim/coverage-exclusions-rv64gc.do
@ -253,3 +253,10 @@ coverage exclude -srcfile priorityonehot.sv
 # Excluding pmpadrdecs[0] coverage case for PAgePMPAdrIn being hardwired to 1
 coverage exclude -scope /dut/core/ifu/immu/immu/pmp/pmpchecker/pmp/pmpadrdecs[0] -linerange [GetLineNum ../src/mmu/pmpadrdec.sv "exclusion-tag: PAgePMPAdrIn"] -item e 1 -fecexprrow 1
 coverage exclude -scope /dut/core/lsu/dmmu/dmmu/pmp/pmpchecker/pmp/pmpadrdecs[0] -linerange [GetLineNum ../src/mmu/pmpadrdec.sv "exclusion-tag: PAgePMPAdrIn"] -item e 1 -fecexprrow 1
+
+####################
+# EBU
+####################
+
+# Exclude EBU Beat Counter because it is only idle when bus has multicycle latency, but rv64gc has single cycle latency
+coverage exclude -scope /dut/core/ebu/ebu/ebufsmarb/BeatCounter
--- a/sim/imperas.ic
+++ b/sim/imperas.ic
@ -20,6 +20,7 @@
 # More extensions
 --override cpu/Zcb=T
 --override cpu/Zicond=T
+--override cpu/Zfh=T

 # Cache block operations
 --override cpu/Zicbom=T
@ -36,6 +37,8 @@
 # SV39 and SV48 supported
 --override cpu/Sv_modes=768

+--override cpu/Svinval=T
+

 #  clarify
 #--override refRoot/cpu/mtvec_sext=F
--- a/src/fpu/fcmp.sv
+++ b/src/fpu/fcmp.sv
@ -36,6 +36,7 @@
 module fcmp import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.FMTBITS-1:0]   Fmt,           // format of fp number
  input  logic [2:0]             OpCtrl,        // see above table
+  input  logic                   Zfa,           // Zfa variants: fminm, fmaxm, fleq, fltq
  input  logic                   Xs, Ys,        // input signs
  input  logic [P.NE-1:0]        Xe, Ye,        // input exponents
  input  logic [P.NF:0]          Xm, Ym,        // input mantissa
@ -70,8 +71,8 @@ module fcmp import cvw::*;  #(parameter cvw_t P) (
        3'b110: CmpNV = EitherSNaN; //min 
        3'b101: CmpNV = EitherSNaN; //max
        3'b010: CmpNV = EitherSNaN; //equal
-        3'b001: CmpNV = EitherNaN;  //less than
-        3'b011: CmpNV = EitherNaN;  //less than or equal
+        3'b001: CmpNV = Zfa ? EitherSNaN : EitherNaN;  // fltq / flt perform CompareQuietLess / CompareSignalingLess differing on when to set invalid
+        3'b011: CmpNV = Zfa ? EitherSNaN : EitherNaN;  // fleq / fle differ on when to set invalid
        default: CmpNV = 1'bx;
    endcase
  end 
@ -128,6 +129,12 @@ module fcmp import cvw::*;  #(parameter cvw_t P) (
  //    - if one is a NaN output the non-NaN
  always_comb
    if(OpCtrl[0]) // MAX
+        if (Zfa & P.ZFA_SUPPORTED) // fmaxm (Zfa) implements IEEE 754-2019 maximum: result is NaN if either input is NaN
+          if (XNaN | YNaN) CmpFpRes = NaNRes; // either input is NaN
+          else
+            if (LT) CmpFpRes = Y; // X < Y
+            else    CmpFpRes = X; // X >= Y
+        else // fmax implements IEEE 754-2019 maximumNumber: result is NaN only if both inputs are NaN
          if(XNaN)
            if(YNaN)    CmpFpRes = NaNRes;   // X = NaN Y = NaN
            else        CmpFpRes = Y;        // X = NaN Y != NaN
@ -137,6 +144,12 @@ module fcmp import cvw::*;  #(parameter cvw_t P) (
                if(LT)  CmpFpRes = Y;        // X < Y
                else    CmpFpRes = X;        // X > Y
    else  // MIN
+        if (Zfa & P.ZFA_SUPPORTED) // fminm (Zfa) implements IEEE 754-2019 minimum: result is NaN if either input is NaN
+          if (XNaN | YNaN) CmpFpRes = NaNRes; // either input is NaN
+          else
+            if (LT) CmpFpRes = X; // X < Y
+            else    CmpFpRes = Y; // X >= Y
+        else // fmin implements IEEE 754-2019 minimumNumber: result is NaN only if both inputs are NaN
          if(XNaN)
            if(YNaN)    CmpFpRes = NaNRes;   // X = NaN Y = NaN
            else        CmpFpRes = Y;        // X = NaN Y != NaN
--- a/src/fpu/fctrl.sv
+++ b/src/fpu/fctrl.sv
@ -54,6 +54,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  output logic [1:0]           PostProcSelE, PostProcSelM,         // select result in the post processing unit
  output logic [1:0]           FResSelE, FResSelM, FResSelW,       // Select one of the results that finish in the memory stage
  output logic                 FPUActiveE,                         // FP instruction being executed
+  output logic                 ZfaE, ZfaM,                         // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod)
  // register control signals
  output logic                 FRegWriteE, FRegWriteM, FRegWriteW, // FP register write enable
  output logic                 FWriteIntE, FWriteIntM,             // Write to integer register
@ -64,7 +65,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  output logic                 FDivStartE, IDivStartE              // Start division or squareroot
  );

-  `define FCTRLW 12
+  `define FCTRLW 13

  logic [`FCTRLW-1:0]          ControlsD;                          // control signals
  logic                        FRegWriteD;                         // FP register write enable
@ -79,6 +80,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  logic                        SupportedFmt;                       // is the format supported
  logic                        SupportedFmt2;                      // is the source format supported for fp -> fp
  logic                        FCvtIntD, FCvtIntM;                 // convert to integer opperation
+  logic                        ZfaD;                               // Zfa variants of instructions

  // FPU Instruction Decoder
  assign Fmt = Funct7D[1:0];
@ -91,127 +93,165 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                         (Fmt2 == 2'b10 & P.ZFH_SUPPORTED) | (Fmt2 == 2'b11 & P.Q_SUPPORTED));

  // decode the instruction                       
-  // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt
+  // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt_Zfa
  always_comb
    if (STATUS_FS == 2'b00) // FPU instructions are illegal when FPU is disabled
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0;
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0;
    else if (OpD != 7'b0000111 & OpD != 7'b0100111 & ~SupportedFmt) 
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0; // for anything other than loads and stores, check for supported format
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0; // for anything other than loads and stores, check for supported format
    else begin 
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0; // default: non-implemented instruction
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0; // default: non-implemented instruction
      /* verilator lint_off CASEINCOMPLETE */   // default value above has priority so no other default needed
      case(OpD)
        7'b0000111: case(Funct3D)
-                      3'b010:                       ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0; // flw
-                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0; // fld
-                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0; // flq
-                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0; // flh
+                      3'b010:                       ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flw
+                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // fld
+                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flq
+                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flh
                    endcase
        7'b0100111: case(Funct3D)
-                      3'b010:                       ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0; // fsw
-                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0; // fsd
-                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0; // fsq
-                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0; // fsh
+                      3'b010:                       ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsw
+                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsd
+                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsq
+                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsh
                    endcase
-        7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0; // fmadd
-        7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0; // fmsub
-        7'b1001011:   ControlsD = `FCTRLW'b1_0_01_10_010_0_0_0; // fnmsub
-        7'b1001111:   ControlsD = `FCTRLW'b1_0_01_10_011_0_0_0; // fnmadd
+        7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0_0; // fmadd
+        7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0_0; // fmsub
+        7'b1001011:   ControlsD = `FCTRLW'b1_0_01_10_010_0_0_0_0; // fnmsub
+        7'b1001111:   ControlsD = `FCTRLW'b1_0_01_10_011_0_0_0_0; // fnmadd
        7'b1010011: casez(Funct7D)
-                      7'b00000??: ControlsD = `FCTRLW'b1_0_01_10_110_0_0_0; // fadd
-                      7'b00001??: ControlsD = `FCTRLW'b1_0_01_10_111_0_0_0; // fsub
-                      7'b00010??: ControlsD = `FCTRLW'b1_0_01_10_100_0_0_0; // fmul
-                      7'b00011??: ControlsD = `FCTRLW'b1_0_01_01_xx0_1_0_0; // fdiv
-                      7'b01011??: if (Rs2D == 5'b0000) ControlsD = `FCTRLW'b1_0_01_01_xx1_1_0_0; // fsqrt
+                      7'b00000??: ControlsD = `FCTRLW'b1_0_01_10_110_0_0_0_0; // fadd
+                      7'b00001??: ControlsD = `FCTRLW'b1_0_01_10_111_0_0_0_0; // fsub
+                      7'b00010??: ControlsD = `FCTRLW'b1_0_01_10_100_0_0_0_0; // fmul
+                      7'b00011??: ControlsD = `FCTRLW'b1_0_01_01_xx0_1_0_0_0; // fdiv
+                      7'b01011??: if (Rs2D == 5'b0000) ControlsD = `FCTRLW'b1_0_01_01_xx1_1_0_0_0; // fsqrt
                      7'b00100??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_000_0_0_0; // fsgnj
-                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_001_0_0_0; // fsgnjn
-                                    3'b010:  ControlsD = `FCTRLW'b1_0_00_00_010_0_0_0; // fsgnjx
+                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_000_0_0_0_0; // fsgnj
+                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_001_0_0_0_0; // fsgnjn
+                                    3'b010:  ControlsD = `FCTRLW'b1_0_00_00_010_0_0_0_0; // fsgnjx
                                  endcase
                      7'b00101??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0; // fmin
-                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0; // fmax
+                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_0; // fmin
+                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_0; // fmax
+                                    3'b010:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_1; // fminm  (Zfa)
+                                    3'b011:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_1; // fmaxm  (Zfa)
                                  endcase
                      7'b10100??: case(Funct3D)
-                                    3'b010:  ControlsD = `FCTRLW'b0_1_00_00_010_0_0_0; // feq
-                                    3'b001:  ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0; // flt
-                                    3'b000:  ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0; // fle
+                                    3'b000:  ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_0; // fle
+                                    3'b001:  ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_0; // flt
+                                    3'b010:  ControlsD = `FCTRLW'b0_1_00_00_010_0_0_0_0; // feq
+                                    3'b100:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_1; // fleq  (Zfa)
+                                    3'b101:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_1; // fltq  (Zfa)
                                  endcase
                      7'b11100??: if (Funct3D == 3'b001 & Rs2D == 5'b00000)          
-                                                ControlsD = `FCTRLW'b0_1_10_00_000_0_0_0; // fclass
+                                                ControlsD = `FCTRLW'b0_1_10_00_000_0_0_0_0; // fclass
                                  else if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
-                                                ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0; // fmv.x.w/d/h/q  fp to int register
-                      7'b11110??: if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
-                                                ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0; // fmv.w/d/h/q.x  int to fp reg
-                      7'b0100000: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b00)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0; // fcvt.s.(d/q/h)
-                      7'b0100001: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b01)
-                                                ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0; // fcvt.d.(s/h/q)
+                                                ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_0; // fmv.x.w/d/h/q  fp to int register
+                                  else if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct7D[1:0] == 2'b01 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
+                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.d  (Zfa) 
+                                  //  Q not supported in RV64GC
                                  // coverage off   
-                      // Not covered in testing because rv64gc does not support half or quad precision
+                                  else if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct7D[1:0] == 2'b11 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
+                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.q  (Zfa)
+                                  // coverage on
+                      7'b11110??: if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
+                                                ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0_0; // fmv.w/d/h/q.x  int to fp reg
+                                  else if (P.ZFA_SUPPORTED & Funct3D == 3'b000 & Rs2D == 5'b00001) 
+                                                ControlsD = `FCTRLW'b1_0_00_00_111_0_0_0_1; // fli  (Zfa)
+                      7'b0100000: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b00)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_0; // fcvt.s.(d/q/h)
+                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.s  (Zfa) *** needs ctrl for all rounds
+                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.s  (Zfa) *** needs ctrl for all rounds
+                      7'b0100001: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b01)
+                                                ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0_0; // fcvt.d.(s/h/q)
+                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.d  (Zfa)
+                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.d  (Zfa)
                      7'b0100010: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b10)
-                                                ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0; // fcvt.h.(s/d/q)
+                                                ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0_0; // fcvt.h.(s/d/q)
+                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.h  (Zfa)
+                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.h  (Zfa)
+                      // coverage off
+                      // Not covered in testing because rv64gc does not support quad precision
                      7'b0100011: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b11)
-                                                ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0; // fcvt.q.(s/h/d)
+                                                ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0_0; // fcvt.q.(s/h/d)
+                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.q  (Zfa)
+                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.q  (Zfa)
                      // coverage on
                      7'b1101000: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.s.w   w->s
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.s.wu wu->s
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0; // fcvt.s.l   l->s
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0; // fcvt.s.lu lu->s
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.s.w   w->s
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.s.wu wu->s
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.s.l   l->s
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.s.lu lu->s
                                  endcase
                      7'b1100000: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1; // fcvt.w.s   s->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1; // fcvt.wu.s  s->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.s   s->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.s  s->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.s   s->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.s  s->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.s   s->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.s  s->lu
                                  endcase
                      7'b1101001: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.d.w   w->d
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.d.wu wu->d
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0; // fcvt.d.l   l->d
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0; // fcvt.d.lu lu->d
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.d.w   w->d
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.d.wu wu->d
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.d.l   l->d
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.d.lu lu->d
                                  endcase
                      7'b1100001: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1; // fcvt.w.d   d->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1; // fcvt.wu.d  d->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.d   d->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.d  d->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.d   d->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.d  d->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.d   d->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.d  d->lu
+                                    5'b01000: if (P.ZFA_SUPPORTED & P.D_SUPPORTED & Funct3D == 3'b001) 
+                                                 ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_1; // fcvtmod.w.d (Zfa)
                                  endcase
-                      // coverage off
-                      // Not covered in testing because rv64gc does not support half or quad precision
                      7'b1101010: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.h.w   w->h
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.h.wu wu->h
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0; // fcvt.h.l   l->h
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0; // fcvt.h.lu lu->h
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.h.w   w->h
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.h.wu wu->h
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.h.l   l->h
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.h.lu lu->h
                                  endcase
                      7'b1100010: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1; // fcvt.w.h   h->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1; // fcvt.wu.h  h->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.h   h->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.h  h->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.h   h->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.h  h->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.h   h->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.h  h->lu
                                  endcase
+                      // Not covered in testing because rv64gc does not support quad precision
+                      // coverage off
                      7'b1101011: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.q.w   w->q
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.q.wu wu->q
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0; // fcvt.q.l   l->q
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0; // fcvt.q.lu lu->q
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.q.w   w->q
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.q.wu wu->q
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.q.l   l->q
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.q.lu lu->q
                                  endcase
                      7'b1100011: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1; // fcvt.w.q   q->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1; // fcvt.wu.q  q->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1; // fcvt.l.q   q->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1; // fcvt.lu.q  q->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.q   q->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.q  q->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.q   q->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.q  q->lu
                                  endcase
                      // coverage on
+                      7'b1011001: if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct3D == 3'b000) 
+                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fmvp.d.x  (Zfa) *** untested, controls could be wrong
+                      // Not covered in testing because rv64gc does not support quad precision
+                      // coverage off
+                      7'b1011011: if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct3D == 3'b000) 
+                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fmvp.q.x  (Zfa)
+                      // coverage on
                   endcase
      endcase
    end
    /* verilator lint_on CASEINCOMPLETE */

  // unswizzle control bits
-  assign #1 {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD, FCvtIntD} = ControlsD;
+  assign #1 {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD, FCvtIntD, ZfaD} = ControlsD;
  
  // rounding modes:
  //    000 - round to nearest, ties to even
@ -274,6 +314,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  //        011 - mv to fp        01
  //        110 - min             10
  //        101 - max             10
+  //        111 - fli             11

  //  OpCtrl:
  //    Fma: {not multiply-add?, negate prod?, negate Z?}
@ -310,9 +351,9 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  assign Adr3D = InstrD[31:27];
 
  // D/E pipleine register
-  flopenrc #(14+P.FMTBITS) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-              {FRegWriteD, PostProcSelD, FResSelD, FrmD, FmtD, OpCtrlD, FWriteIntD, FCvtIntD, ~IllegalFPUInstrD},
-              {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, FPUActiveE});
+  flopenrc #(15+P.FMTBITS) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+              {FRegWriteD, PostProcSelD, FResSelD, FrmD, FmtD, OpCtrlD, FWriteIntD, FCvtIntD, ZfaD, ~IllegalFPUInstrD},
+              {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE, FPUActiveE});
  flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {Adr1D, Adr2D, Adr3D}, {Adr1E, Adr2E, Adr3E});
  flopenrc #(1) DEFDivStartReg(clk, reset, FlushE, ~StallE|FDivBusyE, FDivStartD, FDivStartE);
  flopenrc #(3) DEEnReg(clk, reset, FlushE, ~StallE, {XEnD, YEnD, ZEnD}, {XEnE, YEnE, ZEnE});
@ -322,9 +363,9 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  else                               assign IDivStartE = 0; 

  // E/M pipleine register
-  flopenrc #(13+int'(P.FMTBITS)) EMCtrlReg (clk, reset, FlushM, ~StallM,
-              {FRegWriteE, FResSelE, PostProcSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE},
-              {FRegWriteM, FResSelM, PostProcSelM, FrmM, FmtM, OpCtrlM, FWriteIntM, FCvtIntM});
+  flopenrc #(14+int'(P.FMTBITS)) EMCtrlReg (clk, reset, FlushM, ~StallM,
+              {FRegWriteE, FResSelE, PostProcSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE},
+              {FRegWriteM, FResSelM, PostProcSelM, FrmM, FmtM, OpCtrlM, FWriteIntM, FCvtIntM, ZfaM});
  
  // renameing for readability
  assign FpLoadStoreM = FResSelM[1];
--- a/src/fpu/fli.sv
+++ b/src/fpu/fli.sv
@ -0,0 +1,219 @@
+///////////////////////////////////////////
+// fli.sv
+//
+// Written: David_Harris@hmc.edu
+// Modified: 1/16/2024
+//
+// Purpose: Floating-point float immediate
+// 
+// Documentation: RISC-V System on Chip Design Chapter 16
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module fli import cvw::*;  #(parameter cvw_t P) (
+  input  logic [4:0]        Rs1,           // Index of immediate to select
+  input  logic [1:0]        Fmt,           // 00 = single, 01 = double, 10 = half, 11 = quad
+  output logic [P.FLEN-1:0] Imm            // Immediate output
+);
+
+  logic [P.FLEN-1:0] HImmBox, SImmBox, DImmBox, QImmBox; // per-format constants extended to FLEN (zero when the format is not supported)
+
+  // select constant for each immediate size supported
+  // Rs1 indexes a 32-entry table; each format below encodes the same list of values at its own width, except where noted
+  ////////////////////////////
+  // half
+  ////////////////////////////
+  
+  if (P.ZFH_SUPPORTED) begin
+    logic [15:0] HImm;
+    always_comb begin
+        case(Rs1) 
+            0:  HImm = 16'hBC00; // -1.0
+            1:  HImm = 16'h0400; // minimum positive normal
+            2:  HImm = 16'h0100; // 2^-16 (subnormal in half precision)
+            3:  HImm = 16'h0200; // 2^-15 (subnormal in half precision)
+            4:  HImm = 16'h1C00; // 2^-8
+            5:  HImm = 16'h2000; // 2^-7
+            6:  HImm = 16'h2C00; // 0.0625
+            7:  HImm = 16'h3000; // 0.125
+            8:  HImm = 16'h3400; // 0.25
+            9:  HImm = 16'h3500; // 0.3125
+            10: HImm = 16'h3600; // 0.375
+            11: HImm = 16'h3700; // 0.4375
+            12: HImm = 16'h3800; // 0.5
+            13: HImm = 16'h3900; // 0.625
+            14: HImm = 16'h3A00; // 0.75
+            15: HImm = 16'h3B00; // 0.875
+            16: HImm = 16'h3C00; // 1.0
+            17: HImm = 16'h3D00; // 1.25
+            18: HImm = 16'h3E00; // 1.5
+            19: HImm = 16'h3F00; // 1.75
+            20: HImm = 16'h4000; // 2.0
+            21: HImm = 16'h4100; // 2.5
+            22: HImm = 16'h4200; // 3.0
+            23: HImm = 16'h4400; // 4.0
+            24: HImm = 16'h4800; // 8.0
+            25: HImm = 16'h4C00; // 16.0
+            26: HImm = 16'h5800; // 128.0
+            27: HImm = 16'h5C00; // 256.0
+            28: HImm = 16'h7800; // 2^15
+            29: HImm = 16'h7C00; // +infinity (2^16 is not representable in half precision)
+            30: HImm = 16'h7C00; // +infinity
+            31: HImm = 16'h7E00; // quiet NaN
+        endcase
+    end
+    assign HImmBox = {{(P.FLEN-16){1'b1}}, HImm}; // NaN-box HImm
+  end else assign HImmBox = '0;
+
+  ////////////////////////////
+  // single
+  ////////////////////////////
+
+  logic [31:0] SImm; // single precision is always generated (no support parameter is checked)
+  always_comb begin
+      case(Rs1) 
+            0:  SImm = 32'hBF800000; // -1.0
+            1:  SImm = 32'h00800000; // minimum positive normal
+            2:  SImm = 32'h37800000; // 2^-16
+            3:  SImm = 32'h38000000; // 2^-15
+            4:  SImm = 32'h3B800000; // 2^-8
+            5:  SImm = 32'h3C000000; // 2^-7
+            6:  SImm = 32'h3D800000; // 0.0625
+            7:  SImm = 32'h3E000000; // 0.125
+            8:  SImm = 32'h3E800000; // 0.25
+            9:  SImm = 32'h3EA00000; // 0.3125
+            10: SImm = 32'h3EC00000; // 0.375
+            11: SImm = 32'h3EE00000; // 0.4375
+            12: SImm = 32'h3F000000; // 0.5
+            13: SImm = 32'h3F200000; // 0.625
+            14: SImm = 32'h3F400000; // 0.75
+            15: SImm = 32'h3F600000; // 0.875
+            16: SImm = 32'h3F800000; // 1.0
+            17: SImm = 32'h3FA00000; // 1.25
+            18: SImm = 32'h3FC00000; // 1.5
+            19: SImm = 32'h3FE00000; // 1.75
+            20: SImm = 32'h40000000; // 2.0
+            21: SImm = 32'h40200000; // 2.5
+            22: SImm = 32'h40400000; // 3.0
+            23: SImm = 32'h40800000; // 4.0
+            24: SImm = 32'h41000000; // 8.0
+            25: SImm = 32'h41800000; // 16.0
+            26: SImm = 32'h43000000; // 128.0
+            27: SImm = 32'h43800000; // 256.0
+            28: SImm = 32'h47000000; // 2^15
+            29: SImm = 32'h47800000; // 2^16
+            30: SImm = 32'h7F800000; // +infinity
+            31: SImm = 32'h7FC00000; // quiet NaN
+      endcase
+  end
+  assign SImmBox = {{(P.FLEN-32){1'b1}}, SImm}; // NaN-box SImm
+
+  ////////////////////////////
+  // double
+  ////////////////////////////
+  
+  if (P.D_SUPPORTED) begin
+    logic [63:0] DImm;
+    always_comb begin
+        case(Rs1) 
+            0:  DImm = 64'hBFF0000000000000; // -1.0
+            1:  DImm = 64'h0010000000000000; // minimum positive normal
+            2:  DImm = 64'h3EF0000000000000; // 2^-16
+            3:  DImm = 64'h3F00000000000000; // 2^-15
+            4:  DImm = 64'h3F70000000000000; // 2^-8
+            5:  DImm = 64'h3F80000000000000; // 2^-7
+            6:  DImm = 64'h3FB0000000000000; // 0.0625
+            7:  DImm = 64'h3FC0000000000000; // 0.125
+            8:  DImm = 64'h3FD0000000000000; // 0.25
+            9:  DImm = 64'h3FD4000000000000; // 0.3125
+            10: DImm = 64'h3FD8000000000000; // 0.375
+            11: DImm = 64'h3FDC000000000000; // 0.4375
+            12: DImm = 64'h3FE0000000000000; // 0.5
+            13: DImm = 64'h3FE4000000000000; // 0.625
+            14: DImm = 64'h3FE8000000000000; // 0.75
+            15: DImm = 64'h3FEC000000000000; // 0.875
+            16: DImm = 64'h3FF0000000000000; // 1.0
+            17: DImm = 64'h3FF4000000000000; // 1.25
+            18: DImm = 64'h3FF8000000000000; // 1.5
+            19: DImm = 64'h3FFC000000000000; // 1.75
+            20: DImm = 64'h4000000000000000; // 2.0
+            21: DImm = 64'h4004000000000000; // 2.5
+            22: DImm = 64'h4008000000000000; // 3.0
+            23: DImm = 64'h4010000000000000; // 4.0
+            24: DImm = 64'h4020000000000000; // 8.0
+            25: DImm = 64'h4030000000000000; // 16.0
+            26: DImm = 64'h4060000000000000; // 128.0
+            27: DImm = 64'h4070000000000000; // 256.0
+            28: DImm = 64'h40E0000000000000; // 2^15
+            29: DImm = 64'h40F0000000000000; // 2^16
+            30: DImm = 64'h7FF0000000000000; // +infinity
+            31: DImm = 64'h7FF8000000000000; // quiet NaN
+        endcase
+    end
+    assign DImmBox = {{(P.FLEN-64){1'b1}}, DImm}; // NaN-box DImm
+  end else assign DImmBox = '0;
+  
+  ////////////////////////////
+  // quad
+  ////////////////////////////
+  
+  if (P.Q_SUPPORTED) begin
+    logic [127:0] QImm; // must be 128 bits wide: a narrower declaration truncates the constants below to their all-zero low bits
+    always_comb begin
+        case(Rs1) 
+            0:  QImm = 128'hBFFF0000000000000000000000000000; // -1.0
+            1:  QImm = 128'h00010000000000000000000000000000; // minimum positive normal
+            2:  QImm = 128'h3FEF0000000000000000000000000000; // 2^-16
+            3:  QImm = 128'h3FF00000000000000000000000000000; // 2^-15
+            4:  QImm = 128'h3FF70000000000000000000000000000; // 2^-8
+            5:  QImm = 128'h3FF80000000000000000000000000000; // 2^-7
+            6:  QImm = 128'h3FFB0000000000000000000000000000; // 0.0625
+            7:  QImm = 128'h3FFC0000000000000000000000000000; // 0.125
+            8:  QImm = 128'h3FFD0000000000000000000000000000; // 0.25
+            9:  QImm = 128'h3FFD4000000000000000000000000000; // 0.3125
+            10: QImm = 128'h3FFD8000000000000000000000000000; // 0.375
+            11: QImm = 128'h3FFDC000000000000000000000000000; // 0.4375
+            12: QImm = 128'h3FFE0000000000000000000000000000; // 0.5
+            13: QImm = 128'h3FFE4000000000000000000000000000; // 0.625
+            14: QImm = 128'h3FFE8000000000000000000000000000; // 0.75
+            15: QImm = 128'h3FFEC000000000000000000000000000; // 0.875
+            16: QImm = 128'h3FFF0000000000000000000000000000; // 1.0
+            17: QImm = 128'h3FFF4000000000000000000000000000; // 1.25
+            18: QImm = 128'h3FFF8000000000000000000000000000; // 1.5
+            19: QImm = 128'h3FFFC000000000000000000000000000; // 1.75
+            20: QImm = 128'h40000000000000000000000000000000; // 2.0
+            21: QImm = 128'h40004000000000000000000000000000; // 2.5
+            22: QImm = 128'h40008000000000000000000000000000; // 3.0
+            23: QImm = 128'h40010000000000000000000000000000; // 4.0
+            24: QImm = 128'h40020000000000000000000000000000; // 8.0
+            25: QImm = 128'h40030000000000000000000000000000; // 16.0
+            26: QImm = 128'h40060000000000000000000000000000; // 128.0
+            27: QImm = 128'h40070000000000000000000000000000; // 256.0
+            28: QImm = 128'h400E0000000000000000000000000000; // 2^15
+            29: QImm = 128'h400F0000000000000000000000000000; // 2^16
+            30: QImm = 128'h7FFF0000000000000000000000000000; // +infinity
+            31: QImm = 128'h7FFF8000000000000000000000000000; // quiet NaN
+        endcase
+    end
+    assign QImmBox = QImm; // NaN-box QImm trivial because Q is longest format
+  end else assign QImmBox = '0;
+
+  mux4 #(P.FLEN) flimux(SImmBox, DImmBox, HImmBox, QImmBox, Fmt, Imm); // select immediate based on format
+
+endmodule
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@ -83,6 +83,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  logic                        XEnE, YEnE, ZEnE;                   // X, Y, Z inputs used for current operation
  logic                        FRegWriteE;                         // Write floating-point register
  logic                        FPUActiveE;                         // FP instruction being executed
+  logic                        ZfaE, ZfaM;                         // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod.w.d)

  // regfile signals
  logic [P.FLEN-1:0]           FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
@ -154,12 +155,13 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  logic [P.FLEN-1:0]           FResultW;                           // final FP result being written to the FP register   

  // other signals
-  logic [P.FLEN-1:0]           AlignedSrcAE;                       // align SrcA from IEU to the floating point format for fmv
+  logic [P.FLEN-1:0]           PreIntSrcE, IntSrcE;                // align SrcA from IEU to the floating point format for fmv / fmvp
  logic [P.FLEN-1:0]           BoxedZeroE;                         // Zero value for Z for multiplication, with NaN boxing if needed
  logic [P.FLEN-1:0]           BoxedOneE;                          // One value for Z for multiplication, with NaN boxing if needed
  logic                        StallUnpackedM;                     // Stall unpacker outputs during multicycle fdivsqrt
  logic [P.FLEN-1:0]           SgnExtXE;                           // Sign-extended X input for move to integer
  logic                        mvsgn;                              // sign bit for extending move
+  logic [P.FLEN-1:0]           FliResE;                            // Floating-point load immediate value

  //////////////////////////////////////////////////////////////////////////////////////////
  // Decode Stage: fctrl decoder, read register file
@ -169,7 +171,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  fctrl #(P) fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
              .IntDivE, .InstrD,
              .StallE, .StallM, .StallW, .FlushE, .FlushM, .FlushW, .FRM_REGW, .STATUS_FS, .FDivBusyE,
-              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .FrmM, .FmtE, .FmtM,
+              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .ZfaE, .ZfaM, .FrmM, .FmtE, .FmtM,
              .FDivStartE, .IDivStartE, .FWriteIntE, .FCvtIntE, .FWriteIntM, .OpCtrlE, .OpCtrlM, .FpLoadStoreM,
              .IllegalFPUInstrD, .XEnD, .YEnD, .ZEnD, .XEnE, .YEnE, .ZEnE,
              .FResSelE, .FResSelM, .FResSelW, .FPUActiveE, .PostProcSelE, .PostProcSelM, .FCvtIntW, 
@ -246,7 +248,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
    .UmM, .FIntDivResultM);

  // compare: fmin/fmax, flt/fle/feq
-  fcmp #(P) fcmp (.Fmt(FmtE), .OpCtrl(OpCtrlE), .Xs(XsE), .Ys(YsE), .Xe(XeE), .Ye(YeE), 
+  fcmp #(P) fcmp (.Fmt(FmtE), .OpCtrl(OpCtrlE), .Zfa(ZfaE), .Xs(XsE), .Ys(YsE), .Xe(XeE), .Ye(YeE), 
    .Xm(XmE), .Ym(YmE), .XZero(XZeroE), .YZero(YZeroE), .XNaN(XNaNE), .YNaN(YNaNE), 
    .XSNaN(XSNaNE), .YSNaN(YSNaNE), .X(XE), .Y(YE), .CmpNV(CmpNVE), 
    .CmpFpRes(CmpFpResE), .CmpIntRes(CmpIntResE));
@ -263,23 +265,35 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
    .ToInt(FWriteIntE), .XZero(XZeroE), .Fmt(FmtE), .Ce(CeE), .ShiftAmt(CvtShiftAmtE), 
    .ResSubnormUf(CvtResSubnormUfE), .Cs(CsE), .IntZero(IntZeroE), .LzcIn(CvtLzcInE));

-  // NaN Box SrcA to convert integer to requested FP size for fmv.*.x
-  if(P.FPSIZES == 1) assign AlignedSrcAE = {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE};
+  // floating-point load immediate: fli
+  if (P.ZFA_SUPPORTED) begin
+    logic [4:0] Rs1E;
+    
+    flopenrc #(5) Rs1EReg(clk, reset, FlushE, ~StallE, InstrD[19:15], Rs1E);
+    fli #(P) fli(.Rs1(Rs1E), .Fmt(FmtE), .Imm(FliResE)); 
+  end else assign FliResE = '0;
+
+  // fmv.*.x: NaN Box SrcA to extend integer to requested FP size 
+  if(P.FPSIZES == 1) assign PreIntSrcE = {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE};
  else if(P.FPSIZES == 2) 
-    mux2 #(P.FLEN) SrcAMux ({{P.FLEN-P.LEN1{1'b1}}, ForwardedSrcAE[P.LEN1-1:0]}, {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE}, FmtE, AlignedSrcAE);
+    mux2 #(P.FLEN) SrcAMux ({{P.FLEN-P.LEN1{1'b1}}, ForwardedSrcAE[P.LEN1-1:0]}, {{P.FLEN-P.XLEN{1'b1}}, ForwardedSrcAE}, FmtE, PreIntSrcE);
  else if(P.FPSIZES == 3 | P.FPSIZES == 4) begin
    localparam XD_LEN = P.D_LEN < P.XLEN ? P.D_LEN : P.XLEN; // shorter of D_LEN and XLEN
    mux3 #(P.FLEN) SrcAMux ({{P.FLEN-P.S_LEN{1'b1}}, ForwardedSrcAE[P.S_LEN-1:0]}, 
                            {{P.FLEN-XD_LEN{1'b1}}, ForwardedSrcAE[XD_LEN-1:0]}, 
                            {{P.FLEN-P.H_LEN{1'b1}}, ForwardedSrcAE[P.H_LEN-1:0]}, 
-                            FmtE, AlignedSrcAE); // NaN boxing zeroes
+                            FmtE, PreIntSrcE); // NaN boxing zeroes
  end
+  // fmvp.*.x: Select pair of registers (only when Zfa is supported and FLEN is twice XLEN: RV32 with D, or RV64 with Q)
+  if (P.ZFA_SUPPORTED & ((P.XLEN==32 & P.D_SUPPORTED) | (P.XLEN==64 & P.Q_SUPPORTED))) // parenthesized so ZFA_SUPPORTED gates both cases (& binds tighter than |)
+       assign IntSrcE = ZfaE ? {ForwardedSrcBE, ForwardedSrcAE} : PreIntSrcE; // choose pair of integer registers for fmvp.d.x / fmvp.q.x
+  else assign IntSrcE = PreIntSrcE;

  // select a result that may be written to the FP register
-  mux3  #(P.FLEN) FResMux(SgnResE, AlignedSrcAE, CmpFpResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
+  mux4  #(P.FLEN) FResMux(SgnResE, IntSrcE, CmpFpResE, FliResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
  assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE);

-  // select the result that may be written to the integer register with fmv.x.*
+  // fmv.x.*: select the result that may be written to the integer register
  if(P.FPSIZES == 1) begin
    assign mvsgn = XE[P.FLEN-1];
    assign SgnExtXE = XE;
@ -296,7 +310,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (

  // sign extend to XLEN if necessary
  if (P.FLEN>P.XLEN)
-    assign IntSrcXE = SgnExtXE[P.XLEN-1:0];
+    if (P.ZFA_SUPPORTED) assign IntSrcXE = ZfaE ? XE[P.FLEN-1:P.FLEN/2] : SgnExtXE[P.XLEN-1:0]; // either fmvh.x.* or fmv.x.*
+    else                 assign IntSrcXE = SgnExtXE[P.XLEN-1:0];
  else 
    assign IntSrcXE = {{P.XLEN-P.FLEN{mvsgn}}, SgnExtXE};
  mux3 #(P.XLEN) IntResMux (ClassResE, IntSrcXE, CmpIntResE, {~FResSelE[1], FResSelE[0]}, FIntResE);
@ -333,7 +348,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
    .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), 
    .FmaSm(SmM), .DivUe(UeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
    .CvtCe(CeM), .CvtResSubnormUf(CvtResSubnormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), 
-    .ToInt(FWriteIntM), .DivSticky(DivStickyM), .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), 
+    .ToInt(FWriteIntM), .Zfa(ZfaM), .DivSticky(DivStickyM), .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), 
    .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));

  // FPU flag selection - to privileged
--- a/src/fpu/postproc/flags.sv
+++ b/src/fpu/postproc/flags.sv
@ -70,7 +70,7 @@ module flags import cvw::*;  #(parameter cvw_t P) (
  logic                        DivInvalid;             // integer invalid flag
  logic                        Underflow;              // Underflow flag
  logic                        ResExpGteMax;           // is the result greater than or equal to the maximum floating point expoent
-  logic                        ShiftGtIntSz;           // is the shift greater than the the integer size (use Re to account for possible roundning "shift")
+  logic                        ShiftGtIntSz;           // is the shift greater than the the integer size (use Re to account for possible rounding "shift")

  ///////////////////////////////////////////////////////////////////////////////
  // Overflow
--- a/src/fpu/postproc/postprocess.sv
+++ b/src/fpu/postproc/postprocess.sv
@ -56,6 +56,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
  input logic                              CvtResSubnormUf,     // the convert result is subnormal or underflows
  input logic  [P.LOGCVTLEN-1:0]           CvtShiftAmt,         // how much to shift by
  input logic                              ToInt,               // is fp->int (since it's writting to the integer register)
+  input logic                              Zfa,                 // Zfa operation (fcvtmod.w.d)
  input logic  [P.CVTLEN-1:0]              CvtLzcIn,            // input to the Leading Zero Counter (without msb)
  input logic                              IntZero,             // is the integer input zero
  // final results
@ -88,7 +89,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
  logic [P.NE+1:0]             NormSumExp;           // exponent of the normalized sum not taking into account Subnormal or zero results
  logic                        FmaPreResultSubnorm;  // is the result subnormal - calculated before LZA corection
  logic [$clog2(3*P.NF+5)-1:0] FmaShiftAmt;          // normalization shift amount for fma
-  // division singals
+  // division signals
  logic [P.LOGNORMSHIFTSZ-1:0] DivShiftAmt;          // divsqrt shif amount
  logic [P.NORMSHIFTSZ-1:0]    DivShiftIn;           // divsqrt shift input
  logic [P.NE+1:0]             Ue;                   // divsqrt corrected exponent after corretion shift
@ -218,7 +219,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (

  specialcase #(P) specialcase(.Xs, .Xm, .Ym, .Zm, .XZero, .IntInvalid, 
      .IntZero, .Frm, .OutFmt, .XNaN, .YNaN, .ZNaN, .CvtResUf, 
-      .NaNIn, .IntToFp, .Int64, .Signed, .CvtOp, .FmaOp, .Plus1, .Invalid, .Overflow, .InfIn, .CvtNegRes,
+      .NaNIn, .IntToFp, .Int64, .Signed, .Zfa, .CvtOp, .FmaOp, .Plus1, .Invalid, .Overflow, .InfIn, .CvtNegRes,
      .XInf, .YInf, .DivOp, .DivByZero, .FullRe, .CvtCe, .Rs, .Re, .Rf, .PostProcRes, .FCvtIntRes);

 endmodule
--- a/src/fpu/postproc/shiftcorrection.sv
+++ b/src/fpu/postproc/shiftcorrection.sv
@ -44,7 +44,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
 );

-  logic [3*P.NF+3:0]               CorrSumShifted;         // the shifted sum after LZA correction
+  logic [P.CORRSHIFTSZ-1:0]        CorrSumShifted;         // the shifted sum after LZA correction
  logic [P.CORRSHIFTSZ-1:0]        CorrQm0, CorrQm1;       // portions of Shifted to select for CorrQmShifted
  logic [P.CORRSHIFTSZ-1:0]        CorrQmShifted;          // the shifted divsqrt result after one bit shift
  logic                            ResSubnorm;             // is the result Subnormal
@ -68,7 +68,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  
  // if the result of the divider was calculated to be subnormal, then the result was correctly normalized, so select the top shifted bits
  always_comb
-    if(FmaOp)                       Mf = {CorrSumShifted, {P.CORRSHIFTSZ-(3*P.NF+4){1'b0}}};
+    if(FmaOp)                       Mf = {CorrSumShifted};
    else if (DivOp&~DivResSubnorm)  Mf = CorrQmShifted;
    else                            Mf = Shifted[P.NORMSHIFTSZ-1:P.NORMSHIFTSZ-P.CORRSHIFTSZ];
    
--- a/src/fpu/postproc/specialcase.sv
+++ b/src/fpu/postproc/specialcase.sv
@ -53,6 +53,7 @@ module specialcase import cvw::*;  #(parameter cvw_t P) (
  input  logic                 IntToFp,           // is cvt int -> fp opperation
  input  logic                 Int64,             // is the integer 64 bits
  input  logic                 Signed,            // is the integer signed
+  input  logic                 Zfa,               // Zfa conversion operation: fcvtmod.w.d
  input  logic [P.NE:0]        CvtCe,             // the calculated expoent for cvt
  input  logic                 IntInvalid,        // integer invalid flag to choose the result
  input  logic                 CvtResUf,          // does the convert result underflow
@ -70,10 +71,12 @@ module specialcase import cvw::*;  #(parameter cvw_t P) (
  logic [P.FLEN-1:0]   OfRes;      // overflowed result result
  logic [P.FLEN-1:0]   NormRes;    // normal result
  logic [P.XLEN-1:0]   OfIntRes;   // the overflow result for integer output
+  logic [P.XLEN-1:0]   OfIntRes2;  // the overflow result for integer output after accounting for fcvtmod.w.d
+  logic [P.XLEN-1:0]   Int64Res;   // Result for conversion to 64-bit int after accounting for fcvtmod.w.d
  logic                OfResMax;   // does the of result output maximum norm fp number
  logic                KillRes;    // kill the result for underflow
-  logic                SelOfRes;   // should the overflow result be selected
-
+  logic                SelOfRes;   // should the overflow result be selected (excluding convert)
+  logic                SelCvtOfRes; // select overflow result for convert instruction

  // does the overflow result output the maximum normalized floating point number
  //                output infinity if the input is infinity
@ -329,6 +332,25 @@ module specialcase import cvw::*;  #(parameter cvw_t P) (
            else          OfIntRes = {P.XLEN{1'b1}}; // unsigned positive
    end  
   
+  // fcvtmod.w.d logic
+  // fcvtmod.w.d is like fcvt.w.d except that it takes bits [31:0] and sign extends the rest,
+  // and converts +/-inf and NaN to zero.
+
+  if (P.ZFA_SUPPORTED & P.D_SUPPORTED) // fcvtmod.w.d support
+    always_comb begin
+        if (Zfa) OfIntRes2 = '0;                
+        else     OfIntRes2 = OfIntRes;
+        if (Zfa) Int64Res = {{(P.XLEN-32){CvtNegRes[31]}}, CvtNegRes[31:0]}; // sign extend from bit 31 of the low word, not from the MSB of the full-width result
+        else     Int64Res = CvtNegRes[P.XLEN-1:0];
+        if (Zfa) SelCvtOfRes = InfIn | NaNIn; // fcvtmod.w.d only overflows to 0 on NaN or Infinity
+        else     SelCvtOfRes = IntInvalid;    // regular fcvt gives an overflow if out of range
+    end
+  else 
+    always_comb begin // no fcvtmod.w.d support
+        OfIntRes2 = OfIntRes;
+        Int64Res = CvtNegRes[P.XLEN-1:0];
+        SelCvtOfRes = IntInvalid;
+    end

  // select the integer output
  //      - if the input is invalid (out of bounds NaN or Inf) then output overflow res
@ -337,10 +359,10 @@ module specialcase import cvw::*;  #(parameter cvw_t P) (
  //          - otherwise output a rounded 0
  //      - otherwise output the normal res (trmined and sign extended if nessisary)
  always_comb
-    if(IntInvalid)          FCvtIntRes = OfIntRes;
+    if(SelCvtOfRes)         FCvtIntRes = OfIntRes2; 
    else if(CvtCe[P.NE]) 
      if(Xs&Signed&Plus1)   FCvtIntRes = {{P.XLEN{1'b1}}};
      else                  FCvtIntRes = {{P.XLEN-1{1'b0}}, Plus1};
-    else if(Int64)          FCvtIntRes = CvtNegRes[P.XLEN-1:0];
+    else if(Int64)          FCvtIntRes = Int64Res;
    else                    FCvtIntRes = {{P.XLEN-32{CvtNegRes[31]}}, CvtNegRes[31:0]};
 endmodule
--- a/src/mmu/mmu.sv
+++ b/src/mmu/mmu.sv
@ -140,7 +140,7 @@ module mmu import cvw::*;  #(parameter cvw_t P,
      2'b11:  DataMisalignedM = |VAdr[2:0];        // ld, sd, fld, fsd
    endcase 
  assign LoadMisalignedFaultM     = DataMisalignedM & ReadNoAmoAccessM & ~(P.ZICCLSM_SUPPORTED & Cacheable); 
-  assign StoreAmoMisalignedFaultM = DataMisalignedM & WriteAccessM & ~(P.ZICCLSM_SUPPORTED & Cacheable);
+  assign StoreAmoMisalignedFaultM = DataMisalignedM & WriteAccessM & (~(P.ZICCLSM_SUPPORTED & Cacheable) | ReadAccessM); // Misaligned AMO faults even if ZICCLSM supported

  // Specify which type of page fault is occurring
  assign InstrPageFaultF    = TLBPageFault & ExecuteAccessF;
--- a/testbench/common/instrNameDecTB.sv
+++ b/testbench/common/instrNameDecTB.sv
@ -298,6 +298,18 @@ module instrNameDecTB(
                       else if (funct7[6:2] == 5'b11100 & funct3 == 3'b001) name = "FCLASS";
                       else if (funct7[6:2] == 5'b00100 & funct3 == 3'b010) name = "FSGNJX";
                       else if (funct7[6:2] == 5'b10100 & funct3 == 3'b010) name = "FEQ";
+                       else if (funct7[6:2] == 5'b11110 & funct3 == 3'b000 & rs2 == 5'b00001) name = "FLI";
+                       else if (funct7[6:2] == 5'b00101 & funct3 == 3'b010) name = "FMINM";
+                       else if (funct7[6:2] == 5'b00101 & funct3 == 3'b011) name = "FMAXM";
+                       else if (funct7[6:2] == 5'b01000 & rs2 == 5'b00100) name = "FROUND";
+                       else if (funct7[6:2] == 5'b01000 & rs2 == 5'b00101) name = "FROUNDNX";
+                       else if (funct7[6:2] == 5'b10100 & funct3 == 3'b100) name = "FLEQ";
+                       else if (funct7[6:2] == 5'b10100 & funct3 == 3'b101) name = "FLTQ";
+                       else if (funct7 == 7'b1110001 & funct3 == 3'b000 & rs2 == 5'b00001) name = "FMVH.X.D";
+                       else if (funct7 == 7'b1110011 & funct3 == 3'b000 & rs2 == 5'b00001) name = "FMVH.X.Q";
+                       else if (funct7 == 7'b1011001 & funct3 == 3'b000) name = "FMVP.D.X";
+                       else if (funct7 == 7'b1011011 & funct3 == 3'b000) name = "FMVP.Q.X";
+                       else if (funct7 == 7'b1100001 & funct3 == 3'b001 & rs2 == 5'b01000) name = "FCVTMOD.W.D";
                       else                              name = "ILLEGAL";
      10'b0000111_010: name = "FLW";
      10'b0100111_010: name = "FSW";
--- a/testbench/testbench.sv
+++ b/testbench/testbench.sv
@ -128,7 +128,8 @@ module testbench;
        "arch64zicboz":  if (P.ZICBOZ_SUPPORTED)  tests = arch64zicboz;
        "arch64zcb":     if (P.ZCB_SUPPORTED)     tests = arch64zcb;
        "arch64zfh":     if (P.ZFH_SUPPORTED)     tests = arch64zfh;
-//        "arch64zfa":     if (P.ZFA_SUPPORTED)     tests = arch64zfa;
+        "arch64zfaf":    if (P.ZFA_SUPPORTED)     tests = arch64zfaf;
+        "arch64zfad":    if (P.ZFA_SUPPORTED & P.D_SUPPORTED)  tests = arch64zfad;
      endcase 
    end else begin // RV32
      case (TEST)
@ -165,6 +166,7 @@ module testbench;
        "arch32zcb":     if (P.ZCB_SUPPORTED)     tests = arch32zcb;
        "arch32zfh":     if (P.ZFH_SUPPORTED)     tests = arch32zfh;
        "arch32zfaf":    if (P.ZFA_SUPPORTED)     tests = arch32zfaf;
+        "arch32zfad":    if (P.ZFA_SUPPORTED & P.D_SUPPORTED)  tests = arch32zfad;
      endcase
    end
    if (tests.size() == 0) begin
@ -630,8 +632,8 @@ task automatic updateProgramAddrLabelArray;
    end
  end

-  if(ProgramAddrLabelArray["begin_signature"] == 0) $display("Couldn't find begin_signature in %s", ProgramLabelMapFile);
-  if(ProgramAddrLabelArray["sig_end_canary"] == 0) $display("Couldn't find sig_end_canary in %s", ProgramLabelMapFile);
+//  if(ProgramAddrLabelArray["begin_signature"] == 0) $display("Couldn't find begin_signature in %s", ProgramLabelMapFile);
+//  if(ProgramAddrLabelArray["sig_end_canary"] == 0) $display("Couldn't find sig_end_canary in %s", ProgramLabelMapFile);

  $fclose(ProgramLabelMapFP);
  $fclose(ProgramAddrMapFP);
--- a/testbench/tests.vh
+++ b/testbench/tests.vh
@ -1999,16 +1999,82 @@ string arch64zbs[] = '{

  string arch32zfaf[] = '{
    `RISCVARCHTEST,
-    "rv32i_m/F_Zfa/src/fle_b1-01.S",
-    "rv32i_m/F_Zfa/src/fle_b19-01.S",
-    "rv32i_m/F_Zfa/src/fli_b1-01.S",
+    "rv32i_m/F_Zfa/src/fleq_b1-01.S",
+    "rv32i_m/F_Zfa/src/fleq_b19-01.S", 
+    "rv32i_m/F_Zfa/src/fli.s-01.S",
    "rv32i_m/F_Zfa/src/fltq_b1-01.S",
    "rv32i_m/F_Zfa/src/fltq_b19-01.S",
-    "rv32i_m/F_Zfa/src/fmin_b1-01.S",
-    "rv32i_m/F_Zfa/src/fmin_b19-01.S",
-    "rv32i_m/F_Zfa/src/fmax_b1-01.S",
-    "rv32i_m/F_Zfa/src/fmax_b19-01.S",
-    "rv32i_m/F_Zfa/src/fround_b1-01.S"
+    "rv32i_m/D_Zfa/src/fltq_b1-01.S", // these D tests are more comprehensive and seem they should replace the F tests.  Applies to all F tests duplicated in D
+    "rv32i_m/D_Zfa/src/fltq_b19-01.S",
+    "rv32i_m/F_Zfa/src/fminm_b1-01.S",
+    "rv32i_m/F_Zfa/src/fminm_b19-01.S",
+    "rv32i_m/F_Zfa/src/fmaxm_b1-01.S",
+    "rv32i_m/F_Zfa/src/fmaxm_b19-01.S"
+/*    "rv32i_m/F_Zfa/src/fround_b1-01.S" */
+  };
+
+  string arch32zfad[] = '{
+    `RISCVARCHTEST,
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b22-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b23-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b24-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b27-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b28-01.S",
+    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b29-01.S",
+    "rv32i_m/D_Zfa/src/fleq_b1-01.S",
+    "rv32i_m/D_Zfa/src/fleq_b19-01.S", 
+    "rv32i_m/D_Zfa/src/fleq.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fleq.d_b19-01.S", 
+    "rv32i_m/D_Zfa/src/fli.d-01.S",
+    "rv32i_m/D_Zfa/src/fltq_b1-01.S",
+    "rv32i_m/D_Zfa/src/fltq_b19-01.S",
+    "rv32i_m/D_Zfa/src/fltq.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fltq.d_b19-01.S",
+    "rv32i_m/D_Zfa/src/fminm_b1-01.S",
+    "rv32i_m/D_Zfa/src/fminm_b19-01.S",
+    "rv32i_m/D_Zfa/src/fminm.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fminm.d_b19-01.S",
+    "rv32i_m/D_Zfa/src/fmaxm_b1-01.S",
+    "rv32i_m/D_Zfa/src/fmaxm_b19-01.S",
+    "rv32i_m/D_Zfa/src/fmaxm.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fmaxm.d_b19-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b1-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b22-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b23-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b24-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b27-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b28-01.S",
+    "rv32i_m/D_Zfa/src/fmvh.x.d_b29-01.S"
+/*    "rv32i_m/D_Zfa/src/fround_b1-01.S" */
+  };
+
+  string arch64zfaf[] = '{
+    `RISCVARCHTEST,
+    "rv64i_m/F_Zfa/src/fleq_b1-01.S",
+    "rv64i_m/F_Zfa/src/fleq_b19-01.S", 
+    "rv64i_m/F_Zfa/src/fli.s-01.S",
+    "rv64i_m/F_Zfa/src/fltq_b1-01.S",
+    "rv64i_m/F_Zfa/src/fltq_b19-01.S",
+    "rv64i_m/F_Zfa/src/fminm_b1-01.S",
+    "rv64i_m/F_Zfa/src/fminm_b19-01.S",
+    "rv64i_m/F_Zfa/src/fmaxm_b1-01.S",
+    "rv64i_m/F_Zfa/src/fmaxm_b19-01.S"
+/*    "rv64i_m/F_Zfa/src/fround_b1-01.S" */
+  };
+
+  string arch64zfad[] = '{
+    `RISCVARCHTEST,
+    "rv64i_m/D_Zfa/src/fleq_b1-01.S",
+    "rv64i_m/D_Zfa/src/fleq_b19-01.S", 
+    "rv64i_m/D_Zfa/src/fli.d-01.S",
+    "rv64i_m/D_Zfa/src/fltq_b1-01.S",
+    "rv64i_m/D_Zfa/src/fltq_b19-01.S",
+    "rv64i_m/D_Zfa/src/fminm_b1-01.S",
+    "rv64i_m/D_Zfa/src/fminm_b19-01.S",
+    "rv64i_m/D_Zfa/src/fmaxm_b1-01.S",
+    "rv64i_m/D_Zfa/src/fmaxm_b19-01.S"
+/*     "rv64i_m/D_Zfa/src/fround_b1-01.S" */
  };

  string arch32d_fma[] = '{
--- a/tests/coverage/csrwrites.S
+++ b/tests/coverage/csrwrites.S
@ -37,4 +37,31 @@ main:
    csrrw t1, menvcfg, t0
    csrrw t2, senvcfg, t0

+    # testing FIOM with different privilege modes
+    # setting environment config (to both 1 and 0) in each privilege mode
+    csrsi menvcfg, 1
+    li a0, 1
+    ecall               # enter supervisor mode
+
+    li a0, 0            
+    ecall               # enter user mode
+
+    li a0, 1
+    ecall               # enter supervisor mode
+
+    csrsi senvcfg, 1
+    li a0, 0
+    ecall               # enter user mode
+
+    li a0, 3
+    ecall               # enter machine mode
+    csrci menvcfg, 1
+
+    li a0, 1
+    ecall               # enter supervisor mode
+
+    li a0, 0
+    ecall               # enter user mode
+
+
    j done
--- a/tests/coverage/priv.S
+++ b/tests/coverage/priv.S
@ -297,7 +297,32 @@ sretdone:

    wfi

-    j done
+
+
+    # Test uncovered privdec instructions
+    li a0, 3
+    ecall
+    # exercise sfence.inval.ir instruction
+    .word 0x18100073
+
+    # exercise sret with rs1 not 0
+    .word 0x102F8073
+
+
+    # cover mret when mpp = 3 and mprv = 1
+    li a0, 3
+    ecall               # enter machine mode
+    bseti t0, zero, 17
+    csrs mstatus, t0    # set MPRV
+    li t1, 0x00001800   
+    csrs mstatus, t1    # set MPP=3
+    la t1, finished
+    csrr t0, mepc       
+    csrw mepc, t1       # set mepc for mret to jump to
+    mret
+
+
+finished: j done



--- a/tests/riscof/spike/riscof_spike.py
+++ b/tests/riscof/spike/riscof_spike.py
@ -115,6 +115,10 @@ class spike(pluginTemplate):
          self.isa += '_Zicond'
      if "Zicboz" in ispec["ISA"]:
          self.isa += '_Zicboz'
+      if "Zfa" in ispec["ISA"]:
+          self.isa += '_Zfa'
+      if "Zfh" in ispec["ISA"]:
+          self.isa += '_Zfh'
      if "Zca" in ispec["ISA"]:
          self.isa += '_Zca'
      if "Zcb" in ispec["ISA"]:
--- a/tests/riscof/spike/spike_rv32gc_isa.yaml
+++ b/tests/riscof/spike/spike_rv32gc_isa.yaml
@ -1,6 +1,6 @@
 hart_ids: [0]
 hart0:
-  ISA: RV32IMAFDCZicsr_Zicond_Zifencei_Zba_Zbb_Zbc_Zbs
+  ISA: RV32IMAFDCZicsr_Zicond_Zifencei_Zfa_Zfh_Zba_Zbb_Zbc_Zbs
 #  ISA: RV32IMAFDCZicsr_Zicboz_Zifencei_Zca_Zba_Zbb_Zbc_Zbs # _Zbkb_Zcb
  physical_addr_sz: 32
  User_Spec_Version: '2.3'
--- a/tests/riscof/spike/spike_rv64gc_isa.yaml
+++ b/tests/riscof/spike/spike_rv64gc_isa.yaml
@ -2,7 +2,7 @@ hart_ids: [0]
 hart0:
 #  ISA: RV64IMAFDCSUZicsr_Zicboz_Zifencei_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
 #  ISA: RV64IMAFDCSUZicsr_Zifencei_Zca_Zcb_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
-  ISA: RV64IMAFDCSUZicsr_Zicond_Zifencei_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
+  ISA: RV64IMAFDCSUZicsr_Zicond_Zifencei_Zfa_Zfh_Zba_Zbb_Zbc_Zbs # Zkbs_Zcb
  physical_addr_sz: 56
  User_Spec_Version: '2.3'
  supported_xlen: [64]
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-spi-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-spi-01.S
@ -607,6 +607,7 @@ SETUP_PLIC
 .4byte delay1, 0x0000001, write32_test      # reset delay1 register
 .4byte cs_mode, 0x00000000, write32_test    # reset cs_mode
 .4byte tx_mark, 0x00000001, write32_test    # set transmit watermark to 1 (any entry turns mark off)
+.4byte sck_div, 0x00000100, write32_test    # lower SPI clock rate so read32_tests trigger at correct times
 #.4byte ie, 0x00000000, write32_test         # enable transmit interrupt
 .4byte ip, 0x00000001, read32_test          # tx watermark interupt should be pending
 .4byte 0x0, 0x00000000, readmip_test
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-spi-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-spi-01.S
@ -608,6 +608,7 @@ SETUP_PLIC

 .8byte delay1, 0x0000001, write32_test      # reset delay1 register
 .8byte cs_mode, 0x00000000, write32_test    # reset cs_mode
+.8byte sck_div, 0x00000100, write32_test    # lower SPI clock rate so reads are done at the correct time when ICACHE is not supported
 .8byte tx_mark, 0x00000001, write32_test    # set transmit watermark to 1 (any entry turns mark off)
 #.8byte ie, 0x00000000, write32_test         # enable transmit interrupt
 .8byte ip, 0x00000001, read32_test          # tx watermark interupt should be pending