Merge pull request #810 from davidharrishmc/dev

Zk simplification
2025-02-11 06:05:49 +00:00 · 2024-05-25 11:52:56 -05:00 · 2024-05-25 11:52:56 -05:00 · 153e66c4bb
commit 153e66c4bb
parent 5b7b23fd64 cfe83f5b49
26 changed files with 582 additions and 160 deletions
--- a/.gitignore
+++ b/.gitignore
@ -232,6 +232,7 @@ examples/verilog/fulladder/simprofile_dir/
 examples/verilog/fulladder/simv.daidir/
 examples/verilog/fulladder/ucli.key
 examples/verilog/fulladder/verdi_config_file
+examples/crypto/gfmul/gfmul
 tests/functcov
 tests/functcov/*
 tests/functcov/*/*
--- a/bin/wally-tool-chain-install.sh
+++ b/bin/wally-tool-chain-install.sh
@ -48,7 +48,8 @@ sudo apt update -y
 sudo apt upgrade -y
 sudo apt install -y git gawk make texinfo bison flex build-essential python3 libz-dev libexpat-dev autoconf device-tree-compiler ninja-build libpixman-1-dev ncurses-base ncurses-bin libncurses5-dev dialog curl wget ftp libgmp-dev libglib2.0-dev python3-pip pkg-config opam z3 zlib1g-dev automake autotools-dev libmpc-dev libmpfr-dev  gperf libtool patchutils bc mutt ssmtp
 # Other python libraries used through the book.
-sudo -H pip3 install sphinx sphinx_rtd_theme matplotlib scipy scikit-learn adjustText lief markdown pyyaml
+sudo -H pip3 install sphinx sphinx_rtd_theme matplotlib scipy scikit-learn adjustText lief markdown pyyaml 
+sudo -H pip3 install riscv_isac # to generate new tests, such as quads with fp_dataset.py

 # needed for Ubuntu 22.04, gcc cross compiler expects python not python2 or python3.
 if ! command -v python &> /dev/null
--- a/config/derivlist.txt
+++ b/config/derivlist.txt
@ -530,6 +530,246 @@ ZALRSC_SUPPORTED    0
 deriv zalrsc_rv64gc rv64gc
 ZAAMO_SUPPORTED     0

+deriv zba_rv32gc rv32gc
+ZBA_SUPPORTED     1     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbb_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     1     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbc_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     1     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbs_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     1     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbkb_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     1     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbkc_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     1     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbkx_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     1     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zknd_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     1     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zkne_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     1     
+ZKNH_SUPPORTED     0     
+
+deriv zknh_rv32gc rv32gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     1     
+
+deriv zba_rv64gc rv64gc
+ZBA_SUPPORTED     1     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbb_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     1     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbc_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     1     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbs_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     1     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbkb_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     1     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbkc_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     1     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zbkx_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     1     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zknd_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     1     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     0     
+
+deriv zkne_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     1     
+ZKNH_SUPPORTED     0     
+
+deriv zknh_rv64gc rv64gc
+ZBA_SUPPORTED     0     
+ZBB_SUPPORTED     0     
+ZBS_SUPPORTED     0     
+ZBC_SUPPORTED     0     
+ZBKB_SUPPORTED     0     
+ZBKC_SUPPORTED     0     
+ZBKX_SUPPORTED     0     
+ZKND_SUPPORTED     0     
+ZKNE_SUPPORTED     0     
+ZKNH_SUPPORTED     1     
+
 # Floating-point modes supported

 deriv f_rv32gc rv32gc
--- a/examples/crypto/gfmul/Makefile
+++ b/examples/crypto/gfmul/Makefile
@ -0,0 +1,16 @@
+# Makefile - build every .c file in this directory into an executable of the same name
+
+CC     = gcc
+CFLAGS = -O3
+LIBS   = 
+SRCS   = $(wildcard *.c)
+
+PROGS = $(patsubst %.c,%,$(SRCS))
+.PHONY: all clean
+all:	$(PROGS)
+# Pattern rule: compile and link X.c into X (IFLAGS is not set in this file; empty unless supplied externally)
+%: %.c
+	$(CC) $(CFLAGS) $(IFLAGS) -o $@ $< $(LIBS)
+
+clean: 
+	rm -f $(PROGS)
--- a/examples/crypto/gfmul/gfmul.c
+++ b/examples/crypto/gfmul/gfmul.c
@ -0,0 +1,72 @@
+// gfmul.c - Galois Field multiplication
+// James Stine and David Harris 16 May 2024
+
+#include <stdio.h>
+
+/* return a*b mod m(x): shift-and-add multiplication in GF(2^n); m(x) is a bit mask (callers in this file pass one with bit n set) */
+int gfmul(int a, int b, int n, int m) {
+   int result = 0;
+   while (b) {
+     if (b & 1) result = result ^ a; /* if low bit of b is set, add (xor) a into the result */
+     a = a << 1;                     /* multiply a by x */
+     if (a & 1 << n)                 /* << binds tighter than &, so this tests bit n of a */
+       a = a ^ m;                    /* reduce modulo m(x), e.g. AES m(x) = 100011011 when n = 8 */
+     //printf("a = %x, b = %x, result = %x\n", a, b, result);
+     b = b >> 1;                     /* get next bit of b */
+   }
+   return result;
+}
+
+void inverses(void) { /* print a 16x16 hex table: entry (i,j) is the first k with gfmul(i*16+j, k) == 1 for n = 8 */
+    int i, j, k, num;
+
+    printf("\nTable of inverses in GF(2^8) with polynomial m(x) = 100011011\n");
+    for (i=0; i<16; i++) {                 /* table row: high nibble of num */
+        for (j=0; j<16; j++) {             /* table column: low nibble of num */
+            num = i*16+j;
+            if (num ==0) printf ("00 ");   /* 0 is not searched; 00 is printed in its place */
+            else for (k=1; k<256; k++) {   /* brute-force search over k = 1..255 */
+                if (gfmul(num, k, 8, 0b100011011) == 1) {
+                    printf("%02x ", k);
+                    break;                 /* stop at the first (lowest) matching k */
+                }
+            }
+        }
+        printf("\n");
+    }
+}
+
+void inverses3(void) { /* print, for num = 0..7, the first k with gfmul(num, k) == 1 in GF(2^3), m(x) = 1011 */
+    int k, num;
+
+    printf("\nTable of inverses in GF(2^3) with polynomial m(x) = 1011\n");
+    for (num=0; num<8; num++) {
+        if (num == 0) printf ("0 ");       /* 0 is not searched; 0 is printed in its place */
+        else for (k=1; k<8; k++) {         /* brute-force search over k = 1..7 */
+            if (gfmul(num, k, 3, 0b1011) == 1) {
+                printf("%d ", k);
+                break;                     /* stop at the first (lowest) matching k */
+            }
+        }
+    }
+    printf("\n");
+}
+
+
+int main() { /* demo driver: two GF(2^8) products, both inverse tables, then GF(2^3) worked examples */
+  int a = 0xC5;
+  int b = 0xA1;
+
+  printf("The GF(2^8) result is %x\n", gfmul(a,b, 8, 0b100011011));
+  printf("The GF(2^8) result is %x\n", gfmul(0xC1, 0x28, 8, 0b100011011));
+  inverses();
+
+  // tabulate inverses for GF(2^3)
+  inverses3();
+  // check worked examples
+    printf("The GF(2^3) result is %d\n", gfmul(0b101,0b011, 3, 0b1011));
+    printf("The GF(2^3) result is %d\n", gfmul(0b101,0b010, 3, 0b1011));
+    printf("The GF(2^3) result is %d\n", gfmul(0b101,0b100, 3, 0b1011));
+    printf("The GF(2^3) result is %d\n", gfmul(0b101,0b011, 3, 0b1011)); // NOTE(review): same operands as the first GF(2^3) example - confirm intended
+ 
+}
--- a/src/ieu/aes/aes32d.sv
+++ b/src/ieu/aes/aes32d.sv
@ -34,8 +34,8 @@ module aes32d(
   logic [7:0] 			  SboxOut;
   logic [31:0] 		     so, mixed;
   
-   aesinvsbox8 inv_sbox(SboxIn, SboxOut);         // Apply inverse sbox to si
-   assign so = {24'h0, SboxOut};                  // Pad output of inverse substitution box
-   aesinvmixcolumns32 mix(so, mixed);             // Run so through the mixword AES function
+   aesinvsbox8 inv_sbox(SboxIn, SboxOut);          // Apply inverse sbox to si
+   aesinvmixcolumns8 mix(SboxOut, mixed);          // Run so through the InvMixColumns AES function
+   assign so = {24'h0, SboxOut};                   // Pad output of inverse substitution box
   mux2 #(32) rmux(mixed, so, finalround, result); // on final round, skip mixcolumns
 endmodule
--- a/src/ieu/aes/aes32e.sv
+++ b/src/ieu/aes/aes32e.sv
@ -34,8 +34,8 @@ module aes32e(
   logic [7:0] 			  SboxOut;
   logic [31:0] 		     so, mixed;
   
-   aessbox8 sbox(SboxIn, SboxOut);                // Substitute
-   assign so = {24'h0, SboxOut};                  // Pad sbox output
-   aesmixcolumns32 mwd(so, mixed);                // Mix Word using aesmixword component
-   mux2 #(32) rmux(mixed, so, finalround, result); // on final round, skip mixcolumns
+   aessbox8 sbox(SboxIn, SboxOut);                 // Substitute
+   assign so = {24'h0, SboxOut};                   // Pad sbox output
+   aesmixcolumns32 mb(so, mixed);                  // Mix using MixColumns component
+   mux2 #(32) rmux(mixed, so, finalround, result); // on final round, skip MixColumns
 endmodule
--- a/src/ieu/aes/aes64d.sv
+++ b/src/ieu/aes/aes64d.sv
@ -32,20 +32,20 @@ module aes64d(
   output logic [63:0] result
 );
   
-   logic [63:0] 		    ShiftRowOut, SboxOut, MixcolIn, MixcolOut;
+   logic [63:0] 		    ShiftRowsOut, SboxOut, MixcolsIn, MixcolsOut;
   
   // Apply inverse shiftrows to rs2 and rs1
-   aesinvshiftrow64 srow({rs2, rs1}, ShiftRowOut);
+   aesinvshiftrows64 srow({rs2, rs1}, ShiftRowsOut);
   
   // Apply full word inverse substitution to lower doubleord of shiftrow out
-   aesinvsbox64 invsbox(ShiftRowOut,  SboxOut);
+   aesinvsbox64 invsbox(ShiftRowsOut,  SboxOut);
   
-   mux2 #(64) mixcolmux(SboxOut, rs1, aes64im, MixcolIn);
+   mux2 #(64) mixcolmux(SboxOut, rs1, aes64im, MixcolsIn);
   
-   // Apply inverse mixword to sbox outputs
-   aesinvmixcolumns32 invmw0(MixcolIn[31:0], MixcolOut[31:0]);
-   aesinvmixcolumns32 invmw1(MixcolIn[63:32], MixcolOut[63:32]);
+   // Apply inverse MixColumns to sbox outputs
+   aesinvmixcolumns32 invmw0(MixcolsIn[31:0], MixcolsOut[31:0]);
+   aesinvmixcolumns32 invmw1(MixcolsIn[63:32], MixcolsOut[63:32]);
   
   // Final round skips mixcolumns.
-   mux2 #(64) resultmux(MixcolOut, SboxOut, finalround, result);
+   mux2 #(64) resultmux(MixcolsOut, SboxOut, finalround, result);
 endmodule
--- a/src/ieu/aes/aes64e.sv
+++ b/src/ieu/aes/aes64e.sv
@ -34,22 +34,22 @@ module aes64e(
    output logic [63:0] result
 );
  
-    logic [63:0]  ShiftRowOut, SboxOut, MixcolOut;
+    logic [63:0]  ShiftRowsOut, SboxOut, MixcolsOut;
                
    // AES shiftrow unit
-    aesshiftrow64 srow({rs2,rs1}, ShiftRowOut);
+    aesshiftrows64 srow({rs2,rs1}, ShiftRowsOut);
   
    // Apply substitution box to 2 lower words
    // Use the shared sbox in zknde64.sv for the first sbox
-    assign SboxEIn = ShiftRowOut[31:0];
+    assign SboxEIn = ShiftRowsOut[31:0];
    assign SboxOut[31:0] = Sbox0Out;

-    aessbox32 sbox1(ShiftRowOut[63:32], SboxOut[63:32]); // instantiate second sbox
+    aessbox32 sbox1(ShiftRowsOut[63:32], SboxOut[63:32]); // instantiate second sbox

-    // Apply mix columns operations
-    aesmixcolumns32 mw0(SboxOut[31:0],  MixcolOut[31:0]);
-    aesmixcolumns32 mw1(SboxOut[63:32], MixcolOut[63:32]);    
+    // Apply MixColumns operations
+    aesmixcolumns32 mw0(SboxOut[31:0],  MixcolsOut[31:0]);
+    aesmixcolumns32 mw1(SboxOut[63:32], MixcolsOut[63:32]);

    // Skip mixcolumns on last round
-    mux2 #(64) resultmux(MixcolOut, SboxOut, finalround, result);
+    mux2 #(64) resultmux(MixcolsOut, SboxOut, finalround, result);
 endmodule
--- a/src/ieu/aes/aesinvmixcolumns8.sv
+++ b/src/ieu/aes/aesinvmixcolumns8.sv
@ -0,0 +1,47 @@
+///////////////////////////////////////////
+// aesinvmixcolumns8.sv
+//
+// Written: kelvin.tran@okstate.edu, james.stine@okstate.edu
+// Created: 05 March 2024
+//
+// Purpose: AES InvMixColumns applied to a single byte, producing a 32-bit word (used by aes32d)
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-24 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module aesinvmixcolumns8(
+   input  logic [7:0] a, // input byte
+   output logic [31:0] y // four output bytes, one per galoismultinverse8 instance below
+);
+
+   logic [10:0] t, x0, x1, x2, x3; // 11 bits: an 8-bit value shifted left by up to 3
+
+   // Build unreduced carry-less (xor) multiples of a from the shifted copies a<<3, a<<2, a<<1 and a
+   assign t  = {a, 3'b0} ^ {3'b0, a};                          // 8a ^ a       = a * 0x9
+   assign x0 = {a, 3'b0} ^ {1'b0, a, 2'b0} ^ {2'b0, a, 1'b0};  // 8a ^ 4a ^ 2a = a * 0xE
+   assign x1 = t;                                              // a * 0x9
+   assign x2 = t ^ {1'b0, a, 2'b0};                            // 8a ^ 4a ^ a  = a * 0xD
+   assign x3 = t ^ {2'b0, a, 1'b0};                            // 8a ^ 2a ^ a  = a * 0xB
+   // Each instance maps one 11-bit product to one byte of y (presumably reduction modulo the AES polynomial - confirm in galoismultinverse8)
+   galoismultinverse8 gm0 (x0, y[7:0]);
+   galoismultinverse8 gm1 (x1, y[15:8]);
+   galoismultinverse8 gm2 (x2, y[23:16]);
+   galoismultinverse8 gm3 (x3, y[31:24]);
+
+ endmodule 
--- a/src/ieu/aes/aesinvshiftrows64.sv
+++ b/src/ieu/aes/aesinvshiftrows64.sv
@ -1,5 +1,5 @@
 ///////////////////////////////////////////
-// aesinvshiftrow.sv
+// aesinvshiftrows64.sv
 //
 // Written: ryan.swann@okstate.edu, james.stine@okstate.edu
 // Created: 20 February 2024
@ -25,9 +25,9 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////

-module aesinvshiftrow64(
+module aesinvshiftrows64(
   input  logic [127:0] a, 
-   output logic [63:0] y
+   output logic [63:0]  y
 );

   assign y = {a[95:88],   a[119:112], a[15:8],    a[39:32],
--- a/src/ieu/aes/aesmixcolumns8.sv
+++ b/src/ieu/aes/aesmixcolumns8.sv
@ -0,0 +1,39 @@
+///////////////////////////////////////////
+// aesmixcolumns8.sv
+//
+// Written: ryan.swann@okstate.edu, james.stine@okstate.edu, David_Harris@hmc.edu
+// Created: 20 February 2024
+//
+// Purpose: Apply the AES MixColumns Galois field operations to a single byte, producing a 32-bit word
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-24 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+module aesmixcolumns8(
+   input  logic [7:0]  a, // input byte
+   output logic [31:0] y  // {a ^ xa, a, a, xa}
+);
+
+   logic [7:0] xa, xapa;         // xa: output of galoismultforward8; xapa: a ^ xa
+
+   galoismultforward8 gm(a, xa); // xa (presumably a times x in GF(2^8) - confirm in galoismultforward8)
+   assign xapa = a ^ xa;         // a ^ xa
+   assign y = {xapa, a, a, xa};  // y[31:24] = a ^ xa, y[23:8] = {a, a}, y[7:0] = xa
+endmodule
--- a/src/ieu/aes/aesshiftrows64.sv
+++ b/src/ieu/aes/aesshiftrows64.sv
@ -1,5 +1,5 @@
 ///////////////////////////////////////////
-// aesshiftrow.sv
+// aesshiftrows64.sv
 //
 // Written: ryan.swann@okstate.edu, james.stine@okstate.edu
 // Created: 20 February 2024
@ -25,7 +25,7 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////

-module aesshiftrow64(
+module aesshiftrows64(
   input  logic [127:0] a, 
   output logic [63:0] y
 );
--- a/src/ieu/aes/aesshiftrows64.xv
+++ b/src/ieu/aes/aesshiftrows64.xv
@ -0,0 +1,35 @@
+///////////////////////////////////////////
+// aesshiftrows64.sv
+//
+// Written: ryan.swann@okstate.edu, james.stine@okstate.edu
+// Created: 20 February 2024
+//
+// Purpose: AES ShiftRows: select the 8 bytes of the 128-bit input that form the 64-bit output
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-24 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module aesshiftrows64(
+   input  logic [127:0] a, // 128-bit input, viewed as 16 bytes (byte k = a[8k+7:8k])
+   output logic [63:0]  y  // 8 bytes selected from a
+);
+   // From least to most significant byte, y holds bytes 0, 5, 10, 15, 4, 9, 14, 3 of a
+   assign y = {a[31:24],   a[119:112], a[79:72],   a[39:32],
+               a[127:120], a[87:80],   a[47:40],   a[7:0]};   
+endmodule
--- a/src/ieu/alu.sv
+++ b/src/ieu/alu.sv
@ -60,7 +60,22 @@ module alu import cvw::*; #(parameter cvw_t P) (
  // CondShiftA is A for add/sub or a shifted version of A for shift-and-add BMU instructions
  assign CondMaskInvB = SubArith ? ~CondMaskB : CondMaskB;
  assign {Carry, Sum} = CondShiftA + CondMaskInvB + {{(P.XLEN-1){1'b0}}, SubArith};
-  
+
+  // Zicond block conditionally zeros B
+  if (P.ZICOND_SUPPORTED) begin: zicond
+    logic  BZero;
+    
+    assign BZero = (B == 0); // check if rs2 = 0
+    // Create a signal that is 0 when czero.* instruction should clear result
+    // If B = 0 for czero.eqz or if B != 0 for czero.nez
+    always_comb 
+     case (CZero)
+        2'b01:   ZeroCondMaskInvB = {P.XLEN{~BZero}}; // czero.eqz: kill if B = 0
+        2'b10:   ZeroCondMaskInvB = {P.XLEN{BZero}};  // czero.nez: kill if B != 0
+        default: ZeroCondMaskInvB = CondMaskInvB;     // otherwise normal behavior
+      endcase
+  end else assign ZeroCondMaskInvB = CondMaskInvB; // no masking if Zicond is not supported
+
  // Shifts (configurable for rotation)
  shifter #(P) sh(.A, .Amt(B[P.LOG_XLEN-1:0]), .Right(Funct3[2]), .W64, .SubArith, .Y(Shift), .Rotate(BALUControl[2]));

@ -105,18 +120,4 @@ module alu import cvw::*; #(parameter cvw_t P) (
    assign CondShiftA = A;
  end

-  // Zicond block
-  if (P.ZICOND_SUPPORTED) begin: zicond
-    logic  BZero;
-    
-    assign BZero = (B == 0); // check if rs2 = 0
-    // Create a signal that is 0 when czero.* instruction should clear result
-    // If B = 0 for czero.eqz or if B != 0 for czero.nez
-    always_comb 
-     case (CZero)
-        2'b01:   ZeroCondMaskInvB = {P.XLEN{~BZero}}; // czero.eqz: kill if B = 0
-        2'b10:   ZeroCondMaskInvB = {P.XLEN{BZero}};  // czero.nez: kill if B != 0
-        default: ZeroCondMaskInvB = CondMaskInvB;     // otherwise normal behavior
-      endcase
-  end else assign ZeroCondMaskInvB = CondMaskInvB; // no masking if Zicond is not supported
 endmodule
--- a/src/ieu/bmu/bitmanipalu.sv
+++ b/src/ieu/bmu/bitmanipalu.sv
@ -103,18 +103,18 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P) (

  // ZBKB Unit
  if (P.ZBKB_SUPPORTED) begin: zbkb
-    zbkb #(P.XLEN) ZBKB(.A(ABMU), .B(BBMU), .RevA, .W64, .Funct3, .ZBKBSelect(ZBBSelect[2:0]), .ZBKBResult);
+    zbkb #(P.XLEN) ZBKB(.A(ABMU), .B(BBMU), .Funct3, .ZBKBSelect(ZBBSelect[2:0]), .ZBKBResult);
  end else assign ZBKBResult = '0;

  // ZBKX Unit
  if (P.ZBKX_SUPPORTED) begin: zbkx
-    zbkx #(P.XLEN) ZBKX(.A(ABMU), .B(BBMU), .ZBKXSelect(ZBBSelect[2:0]), .ZBKXResult);
+    zbkx #(P.XLEN) ZBKX(.A(ABMU), .B(BBMU), .ZBKXSelect(ZBBSelect[0]), .ZBKXResult);
  end else assign ZBKXResult = '0;

  // ZKND and ZKNE AES decryption and encryption
  if (P.ZKND_SUPPORTED | P.ZKNE_SUPPORTED) begin: zknde
-    if (P.XLEN == 32) zknde32 #(P) ZKN32(.A(ABMU), .B(BBMU), .Funct7, .round(Rs2E[3:0]), .ZKNSelect(ZBBSelect[3:0]), .ZKNDEResult); 
-    else              zknde64 #(P) ZKN64(.A(ABMU), .B(BBMU), .Funct7, .round(Rs2E[3:0]), .ZKNSelect(ZBBSelect[3:0]), .ZKNDEResult); 
+    if (P.XLEN == 32) zknde32 #(P) ZKN32(.A(ABMU), .B(BBMU), .bs(Funct7[6:5]), .round(Rs2E[3:0]), .ZKNSelect(ZBBSelect[3:0]), .ZKNDEResult); 
+    else              zknde64 #(P) ZKN64(.A(ABMU), .B(BBMU),                   .round(Rs2E[3:0]), .ZKNSelect(ZBBSelect[3:0]), .ZKNDEResult); 
  end else assign ZKNDEResult = '0;
 
  // ZKNH Unit
--- a/src/ieu/bmu/bmuctrl.sv
+++ b/src/ieu/bmu/bmuctrl.sv
@ -264,7 +264,7 @@ module bmuctrl import cvw::*;  #(parameter cvw_t P) (
          17'b0110011_0101111_000:     BMUControlsD = `BMUCTRLW'b000_1000_1010_1_0_0_1_0_0_0_0_0;  // sha512sig1h
          17'b0110011_0101011_000:     BMUControlsD = `BMUCTRLW'b000_1000_1011_1_0_0_1_0_0_0_0_0;  // sha512sig1l
          17'b0110011_0101000_000:     BMUControlsD = `BMUCTRLW'b000_1000_1100_1_0_0_1_0_0_0_0_0;  // sha512sum0r
-          17'b0110011_0101001_000:     BMUControlsD = `BMUCTRLW'b000_1000_1101_1_0_0_1_0_0_0_0_0;  // sha512sum1r
+          17'b0110011_0101001_000:     BMUControlsD = `BMUCTRLW'b000_1000_1110_1_0_0_1_0_0_0_0_0;  // sha512sum1r
        endcase

      else if (P.XLEN==64)
--- a/src/ieu/bmu/byteop.sv
+++ b/src/ieu/bmu/byteop.sv
@ -30,24 +30,16 @@

 module byteop #(parameter WIDTH=32) (
  input  logic [WIDTH-1:0] A,             // Operands
-  input  logic [WIDTH-1:0] RevA,          // Reversed A
-  input  logic [1:0]       ByteSelect,    // LSB of Immediate
+  input  logic             ByteSelect,    // LSB of Immediate
  output logic [WIDTH-1:0] ByteResult);   // rev8, orcb result

-  logic [WIDTH-1:0] OrcBResult, Rev8Result, Brev8Result;
+  logic [WIDTH-1:0] OrcBResult, Rev8Result;
  genvar i;

-  for (i=0;i<WIDTH;i+=8) begin:loop
+  for (i=0;i<WIDTH;i+=8) begin:byteloop
    assign OrcBResult[i+7:i] = {8{|A[i+7:i]}};
    assign Rev8Result[WIDTH-i-1:WIDTH-i-8] = A[i+7:i];
-    assign Brev8Result[i+7:i] = RevA[WIDTH-1-i:WIDTH-i-8];
  end

-  // ByteOp Result Mux
-  always_comb begin
-    if (ByteSelect[0] == 1'b0)      ByteResult = Rev8Result;
-    else if (ByteSelect[1] == 1'b0) ByteResult = OrcBResult;
-    else                            ByteResult = Brev8Result;
-  end
-  
+  mux2 #(WIDTH) byteresultmux(Rev8Result, OrcBResult, ByteSelect, ByteResult);
 endmodule
--- a/src/ieu/bmu/zbb.sv
+++ b/src/ieu/bmu/zbb.sv
@ -45,7 +45,7 @@ module zbb #(parameter WIDTH=32) (

  mux2 #(1) ltmux(LT, LTU, BUnsigned , lt);
  cnt #(WIDTH) cnt(.A, .RevA, .B(B[1:0]), .W64, .CntResult);
-  byteop #(WIDTH) bu(.A, .RevA, .ByteSelect({B[10], B[0]}), .ByteResult);
+  byteop #(WIDTH) bu(.A, .ByteSelect(B[0]), .ByteResult);
  ext #(WIDTH) ext(.A, .ExtSelect({~B[2], {B[2] & B[0]}}), .ExtResult);

  // ZBBSelect[2] differentiates between min(u) vs max(u) instruction
--- a/src/ieu/kmu/zbkb.sv
+++ b/src/ieu/kmu/zbkb.sv
@ -26,21 +26,25 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module zbkb #(parameter WIDTH=32) (
-   input  logic [WIDTH-1:0] A, B, RevA,
-   input  logic 	          W64,
-   input  logic [2:0] 	    Funct3,
+   input  logic [WIDTH-1:0] A, B,
+  input  logic [2:0] 	    Funct3,
   input  logic [2:0] 	    ZBKBSelect,
   output logic [WIDTH-1:0] ZBKBResult
 );
   
-   logic [WIDTH-1:0] 	     ByteResult;   // rev8, brev8
+   logic [WIDTH-1:0] 	     Brev8Result;  // brev8 (bits reversed within each byte)
   logic [WIDTH-1:0] 	     PackResult;   // pack, packh, packw (RB64 only)
   logic [WIDTH-1:0] 	     ZipResult;    // zip, unzip
+
+   // brev8 just uses wires
+   genvar i, j;
+   for (i=0;i<WIDTH/8;i=i+1) 
+      for (j=0; j<8; j=j+1) 
+         assign Brev8Result[i*8+j] = A[i*8+7-j];
   
-   byteop #(WIDTH) rev(.A, .RevA, .ByteSelect({B[10], B[0]}), .ByteResult);
   packer #(WIDTH) pack(.A, .B, .PackSelect({ZBKBSelect[2], Funct3[1:0]}), .PackResult);
   zipper #(WIDTH) zip(.A, .ZipSelect(Funct3[2]), .ZipResult);
   
   // ZBKB Result Select Mux
-   mux3 #(WIDTH) zbkbresultmux(ByteResult, PackResult, ZipResult, ZBKBSelect[1:0], ZBKBResult);   
+   mux3 #(WIDTH) zbkbresultmux(Brev8Result, PackResult, ZipResult, ZBKBSelect[1:0], ZBKBResult);   
 endmodule
--- a/src/ieu/kmu/zbkx.sv
+++ b/src/ieu/kmu/zbkx.sv
@ -27,7 +27,7 @@

 module zbkx #(parameter WIDTH=32) (
   input  logic [WIDTH-1:0] A, B,
-   input  logic [2:0] 	    ZBKXSelect,
+   input  logic  	          ZBKXSelect,
   output logic [WIDTH-1:0] ZBKXResult
 );
   
@ -46,5 +46,5 @@ module zbkx #(parameter WIDTH=32) (
      end   
   end

-   assign ZBKXResult = ZBKXSelect[0] ? xperm4 : xperm8;
+   assign ZBKXResult = ZBKXSelect ? xperm4 : xperm8;
 endmodule
--- a/src/ieu/kmu/zknde32.sv
+++ b/src/ieu/kmu/zknde32.sv
@ -28,7 +28,7 @@

 module zknde32 import cvw::*; #(parameter cvw_t P) (
   input  logic [31:0] A, B,
-   input  logic [6:0]  Funct7,
+   input  logic [1:0]  bs,
   input  logic [3:0]  round,
   input  logic [3:0]  ZKNSelect,
   output logic [31:0] ZKNDEResult
@ -39,7 +39,7 @@ module zknde32 import cvw::*; #(parameter cvw_t P) (
    logic [31:0]    ZKNEResult, ZKNDResult, rotin, rotout;             

    // Initial shamt and Sbox input selection steps shared between encrypt and decrypt
-    assign shamt = {Funct7[6:5], 3'b0};          // shamt = bs * 8 (convert bytes to bits)
+    assign shamt = {bs, 3'b0};          // shamt = bs * 8 (convert bytes to bits)
    assign SboxIn = B[shamt +: 8];               // select byte bs of rs2

    // Handle logic specific to encrypt or decrypt
@ -55,6 +55,7 @@ module zknde32 import cvw::*; #(parameter cvw_t P) (
        assign rotin = ZKNEResult;

    // final rotate and XOR steps shared between encrypt and decrypt
-    rotate #(32) mrot(rotin, shamt, rotout);       // Rotate the mixcolumns output left by shamt (bs * 8)
+    mux4 #(32) mrotmux(rotin, {rotin[23:0], rotin[31:24]}, 
+                       {rotin[15:0], rotin[31:16]}, {rotin[7:0], rotin[31:8]},  bs, rotout); // Rotate the mixcolumns output left by shamt (bs * 8)
    assign ZKNDEResult = A ^ rotout;               // xor with running value (A = rs1)
 endmodule
--- a/src/ieu/kmu/zknde64.sv
+++ b/src/ieu/kmu/zknde64.sv
@ -28,7 +28,6 @@

 module zknde64 import cvw::*; #(parameter cvw_t P) (
   input  logic [63:0] A, B,
-   input  logic [6:0]  Funct7,
   input  logic [3:0]  round,
   input  logic [3:0]  ZKNSelect,
   output logic [63:0] ZKNDEResult
@ -39,11 +38,13 @@ module zknde64 import cvw::*; #(parameter cvw_t P) (
   
    if (P.ZKND_SUPPORTED) // ZKND supports aes64ds, aes64dsm, aes64im
        aes64d    aes64d(.rs1(A), .rs2(B), .finalround(ZKNSelect[2]), .aes64im(ZKNSelect[3]), .result(aes64dRes)); // decode AES
-    if (P.ZKNE_SUPPORTED) // ZKNE supports aes64es, aes64esm
+    if (P.ZKNE_SUPPORTED) begin // ZKNE supports aes64es, aes64esm
        aes64e    aes64e(.rs1(A), .rs2(B), .finalround(ZKNSelect[2]), .Sbox0Out, .SboxEIn, .result(aes64eRes));
+        mux2 #(32) sboxmux(SboxEIn, SboxKIn, ZKNSelect[1], Sbox0In);
+    end else    
+        assign Sbox0In = SboxKIn;

    // One S Box is always needed for aes64ks1i and is also needed for aes64e if that is supported.  Put it at the top level to allow sharing
-    mux2 #(32) sboxmux(SboxEIn, SboxKIn, ZKNSelect[1], Sbox0In);
    aessbox32 sbox(Sbox0In, Sbox0Out);                       // Substitute bytes of value obtained for tmp2 using Rijndael sbox

    // Both ZKND and ZKNE support aes64ks1i and aes64ks2 instructions
--- a/src/ieu/sha/sha256.sv
+++ b/src/ieu/sha/sha256.sv
@ -37,29 +37,29 @@ module sha256 (
   // sha256{sig0/sig1/sum0/sum1} select shifted operands for 32-bit xor3 and then sign-extend

   // sha256sig0
-   assign x[0][0] = {A[6:0], A[31:7]};
-   assign x[0][1] = {A[17:0], A[31:18]};
-   assign x[0][2] = {3'b0, A[31:3]};
+   assign x[0][0] = {A[6:0], A[31:7]};    // ror 7
+   assign x[0][1] = {A[17:0], A[31:18]};  // ror 18
+   assign x[0][2] = {3'b0, A[31:3]};      // >> 3

   // sha256sig1
-   assign x[1][0] = {A[16:0], A[31:17]};
-   assign x[1][1] = {A[18:0], A[31:19]};
-   assign x[1][2] = {10'b0, A[31:10]};
+   assign x[1][0] = {A[16:0], A[31:17]};  // ror 17
+   assign x[1][1] = {A[18:0], A[31:19]};  // ror 19
+   assign x[1][2] = {10'b0, A[31:10]};    // >> 10

   // sha256sum0
-   assign x[2][0] = {A[1:0],  A[31:2]};
-   assign x[2][1] = {A[12:0], A[31:13]};
-   assign x[2][2] = {A[21:0], A[31:22]};
+   assign x[2][0] = {A[1:0],  A[31:2]};   // ror 2
+   assign x[2][1] = {A[12:0], A[31:13]};  // ror 13
+   assign x[2][2] = {A[21:0], A[31:22]};  // ror 22

   // sha256sum1
-   assign x[3][0] = {A[5:0], A[31:6]};
-   assign x[3][1] ={A[10:0], A[31:11]};
-   assign x[3][2] = {A[24:0], A[31:25]};
+   assign x[3][0] = {A[5:0], A[31:6]};    // ror 6
+   assign x[3][1] ={ A[10:0], A[31:11]};  // ror 11
+   assign x[3][2] = {A[24:0], A[31:25]};  // ror 25

   // 32-bit muxes to select inputs to xor3 for sha256 
-   assign y[0] = x[ZKNHSelect[1:0]][0]; 
-   assign y[1] = x[ZKNHSelect[1:0]][1]; 
-   assign y[2] = x[ZKNHSelect[1:0]][2]; 
+   assign y[0] = x[ZKNHSelect[1:0]][0];
+   assign y[1] = x[ZKNHSelect[1:0]][1];
+   assign y[2] = x[ZKNHSelect[1:0]][2];

   // sha256 32-bit xor3
   assign result = y[0] ^ y[1] ^ y[2];
--- a/src/ieu/sha/sha512_32.sv
+++ b/src/ieu/sha/sha512_32.sv
@ -31,67 +31,39 @@ module sha512_32 (
   output logic [31:0] result
 );

-   logic [31:0] x[6][6];
-   logic [31:0] y[6];
+   logic [31:0] x[4][3];
+   logic [31:0] y[3];

-   // sha512{sig0h/sig0l/sig1h/sig1l/sum0r/sum1r} select shifted operands for 32-bit xor6
+   // rotate/shift a 64-bit value contained in {B, A} and select 32 bits
+   // sha512{sig0h/sig0l/sig1h/sig1l/sum0r/sum1r} select shifted operands for 32-bit xor

-   // sha512sig0h
-   assign x[0][0] = A >> 1;
-   assign x[0][1] = A >> 7;
-   assign x[0][2] = A >> 8;
-   assign x[0][3] = B << 31;
-   assign x[0][4] = B << 24;
-   assign x[0][5] = '0;   
+   // The l flavors differ from h by using low bits of B instead of zeros in x[0/1][2]

-   // sha512sig0l
-   assign x[1][0] = A >> 1; 
-   assign x[1][1] = A >> 7; 
-   assign x[1][2] = A >> 8; 
-   assign x[1][3] = B << 31;
-   assign x[1][4] = B << 25;
-   assign x[1][5] = B << 24;
+   // sha512sig0h/l
+   assign x[0][0] = {B[0], A[31:1]};                           // ror 1
+   assign x[0][1] = {B[7:0], A[31:8]};                         // ror 8
+   assign x[0][2] = {B[6:0] & {7{ZKNHSelect[0]}}, A[31:7]};    // ror/srl 7

-   // sha512sig1h
-   assign x[2][0] = A << 3;
-   assign x[2][1] = A >> 6;
-   assign x[2][2] = A >> 19;
-   assign x[2][3] = B >> 29;
-   assign x[2][4] = B << 13;
-   assign x[2][5] = '0;      
-
-   // sha512sig1l
-   assign x[3][0] = A << 3; 
-   assign x[3][1] = A >> 6; 
-   assign x[3][2] = A >> 19; 
-   assign x[3][3] = B >> 29;
-   assign x[3][4] = B << 26;
-   assign x[3][5] = B << 13;
+   // sha512sig1h/l
+   assign x[1][0] = {A[28:0], B[31:29]};                       // ror 61
+   assign x[1][1] = {B[18:0], A[31:19]};                       // ror 19
+   assign x[1][2] = {B[5:0] & {6{ZKNHSelect[0]}}, A[31:6]};    // ror/srl 6

   // sha512sum0r
-   assign x[4][0] = A << 25; 
-   assign x[4][1] = A << 30; 
-   assign x[4][2] = A >> 28; 
-   assign x[4][3] = B >> 7;
-   assign x[4][4] = B >> 2;
-   assign x[4][5] = B << 4;
+   assign x[2][0] = {A[6:0], B[31:7]};                         // ror 39
+   assign x[2][1] = {A[1:0], B[31:2]};                         // ror 34
+   assign x[2][2] = {B[27:0], A[31:28]};                       // ror 28

   // sha512sum1r
-   assign x[5][0] = A << 23; 
-   assign x[5][1] = A >> 14; 
-   assign x[5][2] = A >> 18; 
-   assign x[5][3] = B >> 9;
-   assign x[5][4] = B << 18;
-   assign x[5][5] = B << 14;   
+   assign x[3][0] = {A[8:0], B[31:9]};                         // ror 41
+   assign x[3][1] = {B[13:0], A[31:14]};                       // ror 14
+   assign x[3][2] = {B[17:0], A[31:18]};                       // ror 18

   // 32-bit muxes to select inputs to xor3 for sha512
-   assign y[0] = x[ZKNHSelect[2:0]][0]; 
-   assign y[1] = x[ZKNHSelect[2:0]][1]; 
-   assign y[2] = x[ZKNHSelect[2:0]][2];
-   assign y[3] = x[ZKNHSelect[2:0]][3]; 
-   assign y[4] = x[ZKNHSelect[2:0]][4]; 
-   assign y[5] = x[ZKNHSelect[2:0]][5];    
-
+   assign y[0] = x[ZKNHSelect[2:1]][0];
+   assign y[1] = x[ZKNHSelect[2:1]][1];
+   assign y[2] = x[ZKNHSelect[2:1]][2];
+ 
   // sha512 32-bit xor3
-   assign result = y[0] ^ y[1] ^ y[2] ^ y[3] ^ y[4] ^ y[5];
+   assign result = y[0] ^ y[1] ^ y[2];
 endmodule
--- a/src/ieu/sha/sha512_64.sv
+++ b/src/ieu/sha/sha512_64.sv
@ -33,33 +33,33 @@ module sha512_64 (

   logic [63:0] x[4][3];
   logic [63:0] y[3];
-   
-   // sha512{sig0/sig1/sum0/sum1} select shifted operands for 64-bit xor3
+
+   // sha512{sig0/sig1/sum0/sum1} select rotated/shifted operands for 64-bit xor3

   // sha512sig0
-   assign x[0][0] = {A[0],   A[63:1]};
-   assign x[0][1] = {A[7:0], A[63:8]};
-   assign x[0][2] = A >> 7;
+   assign x[0][0] = {A[0],   A[63:1]};    // ror 1
+   assign x[0][1] = {A[7:0], A[63:8]};    // ror 8
+   assign x[0][2] = {7'b0,   A[63:7]};    // >> 7

   // sha512sig1
-   assign x[1][0] = {A[18:0], A[63:19]};
-   assign x[1][1] = {A[60:0], A[63:61]};
-   assign x[1][2] = A >> 6;
+   assign x[1][0] = {A[18:0], A[63:19]};  // ror 19
+   assign x[1][1] = {A[60:0], A[63:61]};  // ror 61
+   assign x[1][2] = {6'b0,    A[63:6]};   // >> 6

   // sha512sum0
-   assign x[2][0] = {A[27:0], A[63:28]};
-   assign x[2][1] = {A[33:0], A[63:34]};
-   assign x[2][2] = {A[38:0], A[63:39]};
+   assign x[2][0] = {A[27:0], A[63:28]};  // ror 28
+   assign x[2][1] = {A[33:0], A[63:34]};  // ror 34
+   assign x[2][2] = {A[38:0], A[63:39]};  // ror 39

   // sha512sum1
-   assign x[3][0] = {A[13:0], A[63:14]};
-   assign x[3][1] = {A[17:0], A[63:18]};
-   assign x[3][2] = {A[40:0], A[63:41]};
+   assign x[3][0] = {A[13:0], A[63:14]};  // ror 14
+   assign x[3][1] = {A[17:0], A[63:18]};  // ror 18
+   assign x[3][2] = {A[40:0], A[63:41]};  // ror 41

   // 64-bit muxes to select inputs to xor3 for sha512
-   assign y[0] = x[ZKNHSelect[1:0]][0]; 
-   assign y[1] = x[ZKNHSelect[1:0]][1]; 
-   assign y[2] = x[ZKNHSelect[1:0]][2]; 
+   assign y[0] = x[ZKNHSelect[1:0]][0];
+   assign y[1] = x[ZKNHSelect[1:0]][1];
+   assign y[2] = x[ZKNHSelect[1:0]][2];

   // sha512 64-bit xor3
   assign result = y[0] ^ y[1] ^ y[2];