diff --git a/bin/derivgen.pl b/bin/derivgen.pl
index 442455c53..21ffc7019 100755
--- a/bin/derivgen.pl
+++ b/bin/derivgen.pl
@@ -90,7 +90,7 @@ foreach my $key (@derivnames) {
     my $datestring = localtime();
     my %hit = ();

-    print $fh "// Config $key automatically derived from $basederiv{$key} on $datestring usubg derivgen.pl\n";
+    print $fh "// Config $key automatically derived from $basederiv{$key} on $datestring using derivgen.pl\n";
     foreach my $line (<$unmod>) {
         foreach my $entry (@{$derivs{$key}}) {
             my @ent = @{$entry};
diff --git a/bin/regression-wally b/bin/regression-wally
index 4e72fae66..f85800bba 100755
--- a/bin/regression-wally
+++ b/bin/regression-wally
@@ -99,6 +99,29 @@ derivconfigtests = [
         ["zaamo_rv32gc", ["arch32i", "arch32a_amo"]],
         ["zalrsc_rv32gc", ["arch32i", "wally32a_lrsc"]],
+# Bit manipulation and crypto variants
+        ["zba_rv32gc", ["arch32i", "arch32zba"]],
+        ["zbb_rv32gc", ["arch32i", "arch32zbb"]],
+        ["zbc_rv32gc", ["arch32i", "arch32zbc"]],
+        ["zbs_rv32gc", ["arch32i", "arch32zbs"]],
+        ["zbkb_rv32gc", ["arch32i", "arch32zbkb"]],
+        ["zbkc_rv32gc", ["arch32i", "arch32zbkc"]],
+        ["zbkx_rv32gc", ["arch32i", "arch32zbkx"]],
+        ["zkne_rv32gc", ["arch32i", "arch32zkne"]],
+        ["zknd_rv32gc", ["arch32i", "arch32zknd"]],
+        ["zknh_rv32gc", ["arch32i", "arch32zknh"]],
+
+        ["zba_rv64gc", ["arch64i", "arch64zba"]],
+        ["zbb_rv64gc", ["arch64i", "arch64zbb"]],
+        ["zbc_rv64gc", ["arch64i", "arch64zbc"]],
+        ["zbs_rv64gc", ["arch64i", "arch64zbs"]],
+        ["zbkb_rv64gc", ["arch64i", "arch64zbkb"]],
+        ["zbkc_rv64gc", ["arch64i", "arch64zbkc"]],
+        ["zbkx_rv64gc", ["arch64i", "arch64zbkx"]],
+        ["zkne_rv64gc", ["arch64i", "arch64zkne"]],
+        ["zknd_rv64gc", ["arch64i", "arch64zknd"]],
+        ["zknh_rv64gc", ["arch64i", "arch64zknh"]],
+
     ### add misaligned tests

     # fp/int divider permutations
@@ -325,7 +348,8 @@ else:

 # run derivative configurations in nightly regression
 if (nightly):
-    addTests(tests_buildrootboot, defaultsim)
+#    addTests(tests_buildrootboot, defaultsim)
+    addTests(tests_buildrootshort, defaultsim)
     addTests(derivconfigtests, defaultsim)
 else:
     addTests(tests_buildrootshort, defaultsim)
@@ -389,7 +413,7 @@ if (testfloat or nightly): # for nightly, run testfloat along with othres
             tc = TestCase(
                     name=test,
                     variant=config,
-                    cmd="wsim --tb testbench_fp --sim questa " + config + " " + test + " > " + sim_log,
+                    cmd="wsim --tb testbench_fp " + config + " " + test + " > " + sim_log,
                     grepstr="All Tests completed with 0 errors",
                     grepfile = WALLY + "/sim/questa/logs/"+config+"_"+test+".log")
             configs.append(tc)
@@ -415,7 +439,7 @@ def main():
     elif '--nightly' in sys.argv:
         TIMEOUT_DUR = 60*1440 # 1 day
     elif '--testfloat' in sys.argv:
-        TIMEOUT_DUR = 5*60 # seconds
+        TIMEOUT_DUR = 30*60 # seconds
     else:
         TIMEOUT_DUR = 10*60 # seconds

diff --git a/bin/testcount.pl b/bin/testcount.pl
index 139902dd3..7ac75d676 100755
--- a/bin/testcount.pl
+++ b/bin/testcount.pl
@@ -34,6 +34,10 @@ for dir in `ls ${WALLY}/addins/riscv-arch-test/riscv-test-suite/rv*/*`
 do
     dir=$(echo $dir | cut -d':' -f1)
     echo $dir
+    if [ $dir == "src" ]
+    then
+        continue
+    fi
     for fn in `ls $dir/src/*.S`
     do
         result=`grep 'inst_' $fn | tail -n 1`
diff --git a/config/derivlist.txt b/config/derivlist.txt
index 174ca5191..d2c939a77 100644
--- a/config/derivlist.txt
+++ b/config/derivlist.txt
@@ -296,9 +296,6 @@ RAS_SIZE 32'd6
 deriv bpred_GSHARE_10_10_10_1_rv32gc rv32gc
 RAS_SIZE 32'd10

-deriv bpred_GSHARE_10_16_10_1_rv32gc rv32gc
-RAS_SIZE 32'd16
-
 deriv bpred_GSHARE_10_16_6_1_rv32gc rv32gc
 BTB_SIZE 32'd6

@@ -368,9 +365,6 @@ INSTR_CLASS_PRED 0
 deriv bpred_GSHARE_10_10_10_0_rv32gc bpred_GSHARE_10_10_10_1_rv32gc
 INSTR_CLASS_PRED 0

-deriv bpred_GSHARE_10_16_10_0_rv32gc bpred_GSHARE_10_16_10_1_rv32gc
-INSTR_CLASS_PRED 0
-
 deriv bpred_GSHARE_10_16_6_0_rv32gc bpred_GSHARE_10_16_6_1_rv32gc
 INSTR_CLASS_PRED 0

diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index 4cf17890c..ba986dadc 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -281,7 +281,7 @@ module fpu import cvw::*; #(parameter cvw_t P) (

   // fround
   fround #(P) fround(.X(XE), .Xs(XsE), .Xe(XeE), .Xm(XmE),
-                     .XNaN(XNaNE), .XSNaN(XSNaNE), .XZero(XZeroE), .Fmt(FmtE), .Frm(FrmE), .Nf(NfE),
+                     .XNaN(XNaNE), .XSNaN(XSNaNE), .Fmt(FmtE), .Frm(FrmE), .Nf(NfE),
                      .ZfaFRoundNX(ZfaFRoundNXE), .FRound(FRoundE), .FRoundNV(FRoundNVE), .FRoundNX(FRoundNXE));
diff --git a/src/fpu/fround.sv b/src/fpu/fround.sv
index 64700834a..d905618ba 100644
--- a/src/fpu/fround.sv
+++ b/src/fpu/fround.sv
@@ -34,7 +34,6 @@ module fround import cvw::*; #(parameter cvw_t P) (
  input  logic [P.NF:0]        Xm,    // input's fraction with leading integer bit (U1.NF)
  input  logic                 XNaN,  // X is NaN
  input  logic                 XSNaN, // X is Signalling NaN
- input  logic                 XZero, // X is Zero
  input  logic [P.FMTBITS-1:0] Fmt,   // the input's precision (11=quad 01=double 00=single 10=half)
  input  logic [2:0]           Frm,   // rounding mode
  input  logic [P.LOGFLEN-1:0] Nf,    // Number of fractional bits in selected format
@@ -44,10 +43,10 @@ module fround import cvw::*; #(parameter cvw_t P) (
  output logic                 FRoundNX // fround inexact
 );

- logic [P.NE-1:0]   E, Xep1, EminusNf;
+ logic [P.NE-1:0]   E, Xep1;
  logic [P.NF:0]     IMask, Tmasknonneg, Tmaskneg, Tmask, HotE, HotEP1, Trunc, Rnd;
  logic [P.FLEN-1:0] W, PackedW;
- logic              Elt0, Eeqm1, Lnonneg, Lp, Rnonneg, Rp, Tp, RoundUp, Two, EgeNf, Exact;
+ logic              Elt0, Eeqm1, Lnonneg, Lp, Rnonneg, Rp, Tp, RoundUp, Two, EgeNf;

  // Unbiased exponent
  assign E = Xe - P.BIAS[P.NE-1:0];
@@ -78,7 +77,7 @@ module fround import cvw::*; #(parameter cvw_t P) (
  assign Eeqm1 = ($signed(E) == -1);

  // Logic for nonnegative mask and rounding bits
- assign IMask = {1'b1, {P.NF{1'b0}}} >>> E;
+ assign IMask = {1'b1, {P.NF{1'b0}}} >>> E; /// if E > Nf, this produces all 0s instead of all 1s.  Hence exact handling is needed below.
  assign Tmasknonneg = ~IMask >>> 1'b1;
  assign HotE = IMask & ~(IMask << 1'b1);
  assign HotEP1 = HotE >> 1'b1;
@@ -100,7 +99,7 @@ module fround import cvw::*; #(parameter cvw_t P) (
  //  if (X is NaN)
  //      W = Canonical NaN
  //      Invalid = (X is signaling NaN)
- //  else if (E >= Nf or X is +/- 0)
+ //  else if (E >= Nf)
  //      W = X // is exact; this also handles infinity
  //  else
  //      RoundUp = RoundingLogic(Xs, L', R', T', rm) // Table 16.4
@@ -117,11 +116,9 @@ module fround import cvw::*; #(parameter cvw_t P) (
  ///////////////////////////

  // Exact logic
- /* verilator lint_off WIDTH */
- assign EminusNf = E - Nf;
- /* verilator lint_on WIDTH */
- assign EgeNf = ~EminusNf[P.NE-1] & (~E[P.NE-1] | E[P.NE-2:0] == '0); // E >= Nf if MSB of E-Nf is 0 and E was positive
- assign Exact = (EgeNf | XZero) & ~XNaN; // result will be exact; no need to round
+ // verilator lint_off WIDTHEXPAND
+ assign EgeNf = (E >= Nf) & Xe[P.NE-1]; // Check if E >= Nf.  Also check that Xe is positive to avoid wraparound problems
+ // verilator lint_on WIDTHEXPAND

  // Rounding logic: determine whether to round up in magnitude
  always_comb begin
@@ -135,22 +132,22 @@ module fround import cvw::*; #(parameter cvw_t P) (
    endcase

    // If result is not exact, select output in unpacked FLEN format initially
-   if (XNaN)            W = {1'b0, {P.NE{1'b1}}, 1'b1, {(P.NF-1){1'b0}}}; // Canonical NaN
-   else if (Elt0)       // 0 <= |X| < 1 rounds to 0 or 1
-     if (RoundUp)       W = {Xs, P.BIAS[P.NE-1:0], {P.NF{1'b0}}};         // round to +/- 1
-     else               W = {Xs, {(P.FLEN-1){1'b0}}};                     // round to +/- 0
-   else begin           // |X| >= 1 rounds to an integer
-     if (RoundUp & Two) W = {Xs, Xep1, {(P.NF){1'b0}}};                   // Round up to 2.0
-     else if (RoundUp)  W = {Xs, Xe, Rnd[P.NF-1:0]};                      // Round up to Rnd
-     else               W = {Xs, Xe, Trunc[P.NF-1:0]};                    // Round down to Trunc
+   if (XNaN)            W = {1'b0, {P.NE{1'b1}}, 1'b1, {(P.NF-1){1'b0}}}; // Canonical NaN
+   else if (EgeNf)      W = {Xs, Xe, Xm[P.NF-1:0]};                       // Exact, no rounding needed
+   else if (Elt0)       // 0 <= |X| < 1 rounds to 0 or 1
+     if (RoundUp)       W = {Xs, P.BIAS[P.NE-1:0], {P.NF{1'b0}}};         // round to +/- 1
+     else               W = {Xs, {(P.FLEN-1){1'b0}}};                     // round to +/- 0
+   else begin           // |X| >= 1 rounds to an integer
+     if (RoundUp & Two) W = {Xs, Xep1, {(P.NF){1'b0}}};                   // Round up to 2.0
+     else if (RoundUp)  W = {Xs, Xe, Rnd[P.NF-1:0]};                      // Round up to Rnd
+     else               W = {Xs, Xe, Trunc[P.NF-1:0]};                    // Round down to Trunc
    end
  end

- packoutput #(P) packoutput(W, Fmt, PackedW); // pack and NaN-box based on selected format.
- mux2 #(P.FLEN) resultmux(PackedW, X, Exact, FRound);
+ packoutput #(P) packoutput(W, Fmt, FRound); // pack and NaN-box based on selected format.

  // Flags
- assign FRoundNV = XSNaN; // invalid if input is signaling NaN
- assign FRoundNX = ZfaFRoundNX & ~(XNaN | Exact) & (Rp | Tp); // Inexact if Round or Sticky bit set for FRoundNX instruction
+ assign FRoundNV = XSNaN;                            // invalid if input is signaling NaN
+ assign FRoundNX = ZfaFRoundNX & ~EgeNf & (Rp | Tp); // Inexact if Round or Sticky bit set for FRoundNX instruction

 endmodule
diff --git a/src/fpu/postproc/divshiftcalc.sv b/src/fpu/postproc/divshiftcalc.sv
index 0a222d724..d45afeea6 100644
--- a/src/fpu/postproc/divshiftcalc.sv
+++ b/src/fpu/postproc/divshiftcalc.sv
@@ -28,10 +28,8 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module divshiftcalc import cvw::*; #(parameter cvw_t P) (
-  input  logic [P.DIVb:0]             DivUm,              // divsqrt significand
   input  logic [P.NE+1:0]             DivUe,              // divsqrt exponent
   output logic [P.LOGNORMSHIFTSZ-1:0] DivShiftAmt,        // divsqrt shift amount
-  output logic [P.NORMSHIFTSZ-1:0]    DivShiftIn,         // divsqrt shift input
   output logic                        DivResSubnorm,      // is the divsqrt result subnormal
   output logic                        DivSubnormShiftPos  // is the subnormal shift amount positive
 );
@@ -68,6 +66,4 @@ module divshiftcalc import cvw::*; #(parameter cvw_t P) (
   assign DivSubnormShiftAmt = DivSubnormShiftPos ? DivSubnormShift[P.LOGNORMSHIFTSZ-1:0] : '0;
   assign DivShiftAmt = DivResSubnorm ? DivSubnormShiftAmt : NormShift;

-  // pre-shift the divider result for normalization
-  assign DivShiftIn = {{P.NF{1'b0}}, DivUm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
 endmodule
diff --git a/src/fpu/postproc/fmashiftcalc.sv b/src/fpu/postproc/fmashiftcalc.sv
index 27f39e2a5..cf334aa9b 100644
--- a/src/fpu/postproc/fmashiftcalc.sv
+++ b/src/fpu/postproc/fmashiftcalc.sv
@@ -28,18 +28,17 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module fmashiftcalc import cvw::*; #(parameter cvw_t P) (
-  input  logic [P.FMTBITS-1:0]           Fmt,                 // precision 1 = double 0 = single
-  input  logic [P.NE+1:0]                FmaSe,               // sum's exponent
-  input  logic [P.FMALEN-1:0]            FmaSm,               // the positive sum
+  input  logic [P.FMTBITS-1:0]          Fmt,                  // precision 1 = double 0 = single
+  input  logic [P.NE+1:0]               FmaSe,                // sum's exponent
+  input  logic [P.FMALEN-1:0]           FmaSm,                // the positive sum
   input  logic [$clog2(P.FMALEN+1)-1:0] FmaSCnt,              // normalization shift count
-  output logic [P.NE+1:0]                NormSumExp,          // exponent of the normalized sum not taking into account Subnormal or zero results
-  output logic                           FmaSZero,            // is the sum zero
-  output logic                           FmaPreResultSubnorm, // is the result subnormal - calculated before LZA corection
-  output logic [$clog2(P.FMALEN+1)-1:0]  FmaShiftAmt,         // normalization shift count
-  output logic [P.FMALEN+1:0]            FmaShiftIn
+  output logic [P.NE+1:0]               NormSumExp,           // exponent of the normalized sum not taking into account Subnormal or zero results
+  output logic                          FmaSZero,             // is the sum zero
+  output logic                          FmaPreResultSubnorm,  // is the result subnormal - calculated before LZA corection
+  output logic [$clog2(P.FMALEN+1)-1:0] FmaShiftAmt           // normalization shift count
 );
-  logic [P.NE+1:0] PreNormSumExp;  // the exponent of the normalized sum with the P.FLEN bias
-  logic [P.NE+1:0] BiasCorr;       // correction for bias
+  logic [P.NE+1:0] PreNormSumExp; // the exponent of the normalized sum with the P.FLEN bias
+  logic [P.NE+1:0] BiasCorr;      // correction for bias

   ///////////////////////////////////////////////////////////////////////////////
   // Normalization
@@ -54,6 +53,7 @@ module fmashiftcalc import cvw::*; #(parameter cvw_t P) (
   //convert the sum's exponent into the proper precision
   if (P.FPSIZES == 1) begin
     assign NormSumExp = PreNormSumExp;
+    assign BiasCorr = '0;
   end else if (P.FPSIZES == 2) begin
     assign BiasCorr = Fmt ? (P.NE+2)'(0) : (P.NE+2)'(P.BIAS1-P.BIAS);
     assign NormSumExp = PreNormSumExp+BiasCorr;
@@ -79,19 +79,19 @@ module fmashiftcalc import cvw::*; #(parameter cvw_t P) (
     assign NormSumExp = PreNormSumExp+BiasCorr;
   end

-  // determine if the result is subnormal: (NormSumExp <= 0) & (NormSumExp >= -FracLen) & ~FmaSZero
+  // determine if the result is subnormal: (NormSumExp <= 0) & (NormSumExp >= -FracLen)
   if (P.FPSIZES == 1) begin
     logic Sum0LEZ, Sum0GEFL;
     assign Sum0LEZ  = PreNormSumExp[P.NE+1] | ~|PreNormSumExp;
     assign Sum0GEFL = $signed(PreNormSumExp) >= $signed((P.NE+2)'(-P.NF-1)); // changed from -2 dh 4/3/24 for issue 655
-    assign FmaPreResultSubnorm = Sum0LEZ & Sum0GEFL & ~FmaSZero;
+    assign FmaPreResultSubnorm = Sum0LEZ & Sum0GEFL;
   end else if (P.FPSIZES == 2) begin
     logic Sum0LEZ, Sum0GEFL, Sum1LEZ, Sum1GEFL;
     assign Sum0LEZ  = PreNormSumExp[P.NE+1] | ~|PreNormSumExp;
     assign Sum0GEFL = $signed(PreNormSumExp) >= $signed((P.NE+2)'(-P.NF-1)); // changed from -2 dh 4/3/24 for issue 655
     assign Sum1LEZ  = $signed(PreNormSumExp) <= $signed((P.NE+2)'(P.BIAS-P.BIAS1));
     assign Sum1GEFL = $signed(PreNormSumExp) >= $signed((P.NE+2)'(-P.NF1-1+P.BIAS-P.BIAS1)) | ~|PreNormSumExp;
-    assign FmaPreResultSubnorm = (Fmt ? Sum0LEZ : Sum1LEZ) & (Fmt ? Sum0GEFL : Sum1GEFL) & ~FmaSZero;
+    assign FmaPreResultSubnorm = (Fmt ? Sum0LEZ : Sum1LEZ) & (Fmt ? Sum0GEFL : Sum1GEFL);
   end else if (P.FPSIZES == 3) begin
     logic Sum0LEZ, Sum0GEFL, Sum1LEZ, Sum1GEFL, Sum2LEZ, Sum2GEFL;
     assign Sum0LEZ  = PreNormSumExp[P.NE+1] | ~|PreNormSumExp;
@@ -102,9 +102,9 @@ module fmashiftcalc import cvw::*; #(parameter cvw_t P) (
     assign Sum2GEFL = $signed(PreNormSumExp) >= $signed((P.NE+2)'(-P.NF2-1+P.BIAS-P.BIAS2)) | ~|PreNormSumExp;
     always_comb begin
       case (Fmt)
-        P.FMT:   FmaPreResultSubnorm = Sum0LEZ & Sum0GEFL; // & ~FmaSZero; // checking sum is not zero is harmless but turns out to be unnecessary
-        P.FMT1:  FmaPreResultSubnorm = Sum1LEZ & Sum1GEFL; // & ~FmaSZero;
-        P.FMT2:  FmaPreResultSubnorm = Sum2LEZ & Sum2GEFL; // & ~FmaSZero;
+        P.FMT:   FmaPreResultSubnorm = Sum0LEZ & Sum0GEFL;
+        P.FMT1:  FmaPreResultSubnorm = Sum1LEZ & Sum1GEFL;
+        P.FMT2:  FmaPreResultSubnorm = Sum2LEZ & Sum2GEFL;
         default: FmaPreResultSubnorm = 1'bx;
       endcase
     end
@@ -120,17 +120,15 @@ module fmashiftcalc import cvw::*; #(parameter cvw_t P) (
     assign Sum3GEFL = $signed(PreNormSumExp) >= $signed((P.NE+2)'(-P.H_NF-1+P.BIAS-P.H_BIAS)) | ~|PreNormSumExp;
     always_comb begin
       case (Fmt)
-        2'h3: FmaPreResultSubnorm = Sum0LEZ & Sum0GEFL & ~FmaSZero;
-        2'h1: FmaPreResultSubnorm = Sum1LEZ & Sum1GEFL & ~FmaSZero;
-        2'h0: FmaPreResultSubnorm = Sum2LEZ & Sum2GEFL & ~FmaSZero;
-        2'h2: FmaPreResultSubnorm = Sum3LEZ & Sum3GEFL & ~FmaSZero;
+        2'h3: FmaPreResultSubnorm = Sum0LEZ & Sum0GEFL;
+        2'h1: FmaPreResultSubnorm = Sum1LEZ & Sum1GEFL;
+        2'h0: FmaPreResultSubnorm = Sum2LEZ & Sum2GEFL;
+        2'h2: FmaPreResultSubnorm = Sum3LEZ & Sum3GEFL;
       endcase
     end
   end

   // set and calculate the shift input and amount
   //  - shift once if killing a product and the result is subnormal
-  assign FmaShiftIn = {2'b0, FmaSm};
-  if (P.FPSIZES == 1) assign FmaShiftAmt = FmaPreResultSubnorm ? FmaSe[$clog2(P.FMALEN-1)-1:0]+($clog2(P.FMALEN-1))'(P.NF+3): FmaSCnt+1;
-  else                assign FmaShiftAmt = FmaPreResultSubnorm ? FmaSe[$clog2(P.FMALEN-1)-1:0]+($clog2(P.FMALEN-1))'(P.NF+3)+BiasCorr[$clog2(P.FMALEN-1)-1:0]: FmaSCnt+1;
+  assign FmaShiftAmt = FmaPreResultSubnorm ? FmaSe[$clog2(P.FMALEN-1)-1:0]+($clog2(P.FMALEN-1))'(P.NF+3)+BiasCorr[$clog2(P.FMALEN-1)-1:0]: FmaSCnt+1;
 endmodule
diff --git a/src/fpu/postproc/postprocess.sv b/src/fpu/postproc/postprocess.sv
index 4e893a82e..2db03cb16 100644
--- a/src/fpu/postproc/postprocess.sv
+++ b/src/fpu/postproc/postprocess.sv
@@ -44,7 +44,7 @@ module postprocess import cvw::*; #(parameter cvw_t P) (
   input  logic                          FmaPs,       // the product's sign
   input  logic                          FmaSs,       // Sum sign
   input  logic [P.NE+1:0]               FmaSe,       // the sum's exponent
-  input  logic [P.FMALEN-1:0]           FmaSm,       // the positive sum
+  input  logic [P.FMALEN-1:0]          FmaSm,        // the positive sum
   input  logic                          FmaASticky,  // sticky bit that is calculated during alignment
   input  logic [$clog2(P.FMALEN+1)-1:0] FmaSCnt,     // the normalization shift count
   //divide signals
@@ -86,13 +86,11 @@ module postprocess import cvw::*; #(parameter cvw_t P) (
   // fma signals
   logic [P.NE+1:0]               FmaMe;               // exponent of the normalized sum
   logic                          FmaSZero;            // is the sum zero
-  logic [P.FMALEN+1:0]           FmaShiftIn;          // fma shift input
   logic [P.NE+1:0]               NormSumExp;          // exponent of the normalized sum not taking into account Subnormal or zero results
   logic                          FmaPreResultSubnorm; // is the result subnormal - calculated before LZA corection
   logic [$clog2(P.FMALEN+1)-1:0] FmaShiftAmt;         // normalization shift amount for fma
   // division signals
   logic [P.LOGNORMSHIFTSZ-1:0]   DivShiftAmt;         // divsqrt shif amount
-  logic [P.NORMSHIFTSZ-1:0]      DivShiftIn;          // divsqrt shift input
   logic [P.NE+1:0]               Ue;                  // divsqrt corrected exponent after corretion shift
   logic                          DivByZero;           // divide by zero flag
   logic                          DivResSubnorm;       // is the divsqrt result subnormal
@@ -145,17 +143,17 @@ module postprocess import cvw::*; #(parameter cvw_t P) (
   cvtshiftcalc #(P) cvtshiftcalc(.ToInt, .CvtCe, .CvtResSubnormUf, .Xm, .CvtLzcIn, .XZero,
                                  .IntToFp, .OutFmt, .CvtResUf, .CvtShiftIn);

-  fmashiftcalc #(P) fmashiftcalc(.FmaSm, .FmaSCnt, .Fmt, .NormSumExp, .FmaSe,
-                                 .FmaSZero, .FmaPreResultSubnorm, .FmaShiftAmt, .FmaShiftIn);
+  fmashiftcalc #(P) fmashiftcalc(.FmaSCnt, .Fmt, .NormSumExp, .FmaSe, .FmaSm,
+                                 .FmaSZero, .FmaPreResultSubnorm, .FmaShiftAmt);

-  divshiftcalc #(P) divshiftcalc(.DivUe, .DivUm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
+  divshiftcalc #(P) divshiftcalc(.DivUe, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt);

   // select which unit's output to shift
   always_comb
     case(PostProcSel)
       2'b10: begin // fma
         ShiftAmt = {{P.LOGNORMSHIFTSZ-$clog2(P.FMALEN-1){1'b0}}, FmaShiftAmt};
-        ShiftIn  = {FmaShiftIn, {P.NORMSHIFTSZ-(P.FMALEN+2){1'b0}}};
+        ShiftIn  = {{2'b00, FmaSm}, {P.NORMSHIFTSZ-(P.FMALEN+2){1'b0}}};
       end
       2'b00: begin // cvt
         ShiftAmt = {{P.LOGNORMSHIFTSZ-$clog2(P.CVTLEN+1){1'b0}}, CvtShiftAmt};
@@ -163,7 +161,7 @@ module postprocess import cvw::*; #(parameter cvw_t P) (
       end
       2'b01: begin //divsqrt
         ShiftAmt = DivShiftAmt;
-        ShiftIn  = DivShiftIn;
+        ShiftIn  = {{P.NF{1'b0}}, DivUm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
       end
       default: begin
         ShiftAmt = {P.LOGNORMSHIFTSZ{1'bx}};
diff --git a/src/ieu/alu.sv b/src/ieu/alu.sv
index e1cae73a6..74eb6f7f6 100644
--- a/src/ieu/alu.sv
+++ b/src/ieu/alu.sv
@@ -109,7 +109,9 @@ module alu import cvw::*; #(parameter cvw_t P) (
   else assign PreALUResult = FullResult;

   // Bit manipulation muxing
-  if (P.ZBC_SUPPORTED | P.ZBS_SUPPORTED | P.ZBA_SUPPORTED | P.ZBB_SUPPORTED | P.ZBKB_SUPPORTED | P.ZBKC_SUPPORTED | P.ZBKX_SUPPORTED | P.ZKND_SUPPORTED | P.ZKNE_SUPPORTED | P.ZKNH_SUPPORTED) begin : bitmanipalu
+  if (P.ZBC_SUPPORTED | P.ZBS_SUPPORTED | P.ZBA_SUPPORTED | P.ZBB_SUPPORTED |
+      P.ZBKB_SUPPORTED | P.ZBKC_SUPPORTED | P.ZBKX_SUPPORTED |
+      P.ZKND_SUPPORTED | P.ZKNE_SUPPORTED | P.ZKNH_SUPPORTED) begin : bitmanipalu
     bitmanipalu #(P) balu(
       .A, .B, .W64, .BSelect, .ZBBSelect, .BMUActive,
       .Funct3, .Funct7, .Rs2E, .LT,.LTU, .BALUControl, .PreALUResult, .FullResult,
diff --git a/src/ieu/bmu/bitmanipalu.sv b/src/ieu/bmu/bitmanipalu.sv
index 36feff63e..76734f97f 100644
--- a/src/ieu/bmu/bitmanipalu.sv
+++ b/src/ieu/bmu/bitmanipalu.sv
@@ -87,18 +87,23 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P) (
   end

   // Bit reverse needed for some ZBB, ZBC instructions
-  if (P.ZBC_SUPPORTED | P.ZBB_SUPPORTED) begin: bitreverse
+  if (P.ZBC_SUPPORTED | P.ZBKC_SUPPORTED | P.ZBB_SUPPORTED) begin: bitreverse
     bitreverse #(P.XLEN) brA(.A(ABMU), .RevA);
   end

   // ZBC and ZBKCUnit
   if (P.ZBC_SUPPORTED | P.ZBKC_SUPPORTED) begin: zbc
-    zbc #(P.XLEN) ZBC(.A(ABMU), .RevA, .B(BBMU), .Funct3, .ZBCResult);
+    zbc #(P) ZBC(.A(ABMU), .RevA, .B(BBMU), .Funct3, .ZBCResult);
   end else assign ZBCResult = '0;

   // ZBB Unit
   if (P.ZBB_SUPPORTED) begin: zbb
     zbb #(P.XLEN) ZBB(.A(ABMU), .RevA, .B(BBMU), .W64, .LT, .LTU, .BUnsigned(Funct3[0]), .ZBBSelect(ZBBSelect[2:0]), .ZBBResult);
+  end else if (P.ZBKB_SUPPORTED) begin: zbkbonly // only needs rev8 portion
+    genvar i;
+    for (i=0;i