Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

2021-07-05 16:07:27 -05:00 · 2021-07-05 16:07:27 -05:00 · 2a62ee2e70
commit 2a62ee2e70
parent b2c5c3f637 5f91b339aa
30 changed files with 446 additions and 427 deletions
--- a/wally-pipelined/regression/wave-dos/peripheral-waves.do
+++ b/wally-pipelined/regression/wave-dos/peripheral-waves.do
@ -9,7 +9,8 @@ add wave /testbench/clk
 add wave /testbench/reset
 add wave -divider

-add wave /testbench/dut/hart/DataStall
+#add wave /testbench/dut/hart/DataStall
+add wave /testbench/debug
 add wave /testbench/dut/hart/StallF
 add wave /testbench/dut/hart/StallD
 add wave /testbench/dut/hart/StallE
--- a/wally-pipelined/src/cache/ICacheCntrl.sv
+++ b/wally-pipelined/src/cache/ICacheCntrl.sv
@ -115,8 +115,8 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
  localparam STATE_INVALIDATE = 'h12; // *** not sure if invalidate or evict? invalidate by cache block or address?
  localparam STATE_TLB_MISS = 'h13;
  localparam STATE_TLB_MISS_DONE = 'h14;
-  
-  
+  localparam STATE_INSTR_PAGE_FAULT = 'h15;
+
  
  localparam AHBByteLength = `XLEN / 8;
  localparam AHBOFFETWIDTH = $clog2(AHBByteLength);
@ -370,13 +370,20 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
        NextState = STATE_READY;
      end
      STATE_TLB_MISS: begin
-        if (ITLBWriteF | WalkerInstrPageFaultF) begin
+        if (WalkerInstrPageFaultF) begin
+          NextState = STATE_INSTR_PAGE_FAULT;
+          ICacheStallF = 1'b0;
+        end else if (ITLBWriteF) begin
          NextState = STATE_TLB_MISS_DONE;
        end else begin
          NextState = STATE_TLB_MISS;
        end
      end
-      STATE_TLB_MISS_DONE : begin
+      STATE_TLB_MISS_DONE: begin
+        NextState = STATE_READY;
+      end
+      STATE_INSTR_PAGE_FAULT: begin
+        ICacheStallF = 1'b0;
        NextState = STATE_READY;
      end
      default: begin
@ -425,8 +432,8 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
  // store read data from memory interface before writing into SRAM.
  genvar 				i;
  generate
-    for (i = 0; i < WORDSPERLINE; i++) begin
-      flopenr #(`XLEN) flop(.clk(clk),
+    for (i = 0; i < WORDSPERLINE; i++) begin:storebuffer
+      flopenr #(`XLEN) sb(.clk(clk),
 			    .reset(reset), 
 			    .en(InstrAckF & (i == FetchCount)),
 			    .d(InstrInF),
--- a/wally-pipelined/src/cache/dmapped.sv
+++ b/wally-pipelined/src/cache/dmapped.sv
@ -106,7 +106,7 @@ module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par
    assign DataWord = ReadLineTransformed[ReadOffset];
    genvar i;
    generate
-        for (i=0; i < LINESIZE/WORDSIZE; i++) begin
+        for (i=0; i < LINESIZE/WORDSIZE; i++) begin:readline
            assign ReadLineTransformed[i] = ReadLine[(i+1)*WORDSIZE-1:i*WORDSIZE];
        end
    endgenerate
@ -214,7 +214,7 @@ module wtdirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par
    assign DataWord = ReadLineTransformed[ReadOffset];
    genvar i;
    generate
-        for (i=0; i < LINESIZE/WORDSIZE; i++) begin
+        for (i=0; i < LINESIZE/WORDSIZE; i++) begin:readline
            assign ReadLineTransformed[i] = ReadLine[(i+1)*WORDSIZE-1:i*WORDSIZE];
        end
    endgenerate
--- a/wally-pipelined/src/ebu/ahblite.sv
+++ b/wally-pipelined/src/ebu/ahblite.sv
@ -216,11 +216,9 @@ module ahblite (
  subwordread swr(.*);

  // Handle AMO instructions if applicable
-  generate 
+  generate
    if (`A_SUPPORTED) begin
      logic [`XLEN-1:0] AMOResult;
-//      amoalu amoalu(.a(HRDATA), .b(WriteDataM), .funct(Funct7M), .width(MemSizeM), 
-//                    .result(AMOResult));
      amoalu amoalu(.srca(HRDATAW), .srcb(WriteDataM), .funct(Funct7M), .width(MemSizeM), 
                    .result(AMOResult));
      mux2 #(`XLEN) wdmux(WriteDataM, AMOResult, AtomicMaskedM[1], WriteData);
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -43,90 +43,93 @@ module fpu (
  output logic [4:0] 	   SetFflagsM,       // FPU flags
  output logic [`XLEN-1:0] FPUResultW);      // FPU result
 // *** change FMA to do 16 - 32 - 64 - 128 FEXPBITS 
-   // control logic signal instantiation
-   logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;              // FP register write enable
-   logic [2:0] 	FrmD, FrmE, FrmM;                                  // FP rounding mode
-   logic 		   FmtD, FmtE, FmtM, FmtW;                                  // FP precision 0-single 1-double
-   logic 		   FDivStartD, FDivStartE;                                  // Start division
-   logic 		   FWriteIntD;                                              // Write to integer register
-   logic [1:0]    ForwardXE, ForwardYE, ForwardZE;                        // Input3 forwarding mux control signal
-   logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
-   logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM;                  // Select which opperation to do in each component
-   logic [1:0]    FResSelD, FResSelE, FResSelM;  
-   logic [1:0]    FIntResSelD, FIntResSelE, FIntResSelM;                                   
-   logic [4:0] 	Adr1E, Adr2E, Adr3E;
-   
-   // regfile signals
-   logic [4:0]    RdE, RdM, RdW;                                           // what adress to write to    // ***Can take from ieu insted of pipelining
-   logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
-   logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
-   logic [`XLEN-1:0]   SrcXMAligned;
-   logic [63:0] 	SrcXE, SrcXM;                         // Input 1 to the various units (after forwarding)
-   logic [63:0] 	SrcYE, SrcYM;                                      // Input 2 to the various units (after forwarding)
-   logic [63:0] 	SrcZE, SrcZM;                                      // Input 3 to the various units (after forwarding)
-   
-   // div/sqrt signals
-   logic [63:0] 	FDivResultM, FDivResultW;
-   logic [4:0]    FDivSqrtFlgM, FDivSqrtFlgW;
-   logic          FDivSqrtDoneE;
-   logic [63:0] 	DivInput1E, DivInput2E;
-   logic          HoldInputs;                                              // keep forwarded inputs arround durring division
-   
-   // FMA signals
-	logic [105:0]	ProdManE, ProdManM; ///*** put pipline stages in units
-	logic [161:0]	AlignedAddendE, AlignedAddendM;                       
-	logic [12:0]	ProdExpE, ProdExpM;
-	logic 			AddendStickyE, AddendStickyM;
-	logic 			KillProdE, KillProdM;
-	logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
-	logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
-	logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
-   logic [63:0]   FMAResM, FMAResW;
-   logic [4:0]    FMAFlgM, FMAFlgW;

-   // add/cvt signals
-   logic [63:0] 	AddSumE, AddSumM;
-   logic [63:0]   AddSumTcE, AddSumTcM;
-   logic [3:0] 	AddSelInvE, AddSelInvM;
-   logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
-   logic 		   AddCorrSignE, AddCorrSignM;
-   logic          AddOp1NormE, AddOp1NormM;
-   logic          AddOp2NormE, AddOp2NormM;
-   logic          AddOpANormE,  AddOpANormM;
-   logic          AddOpBNormE, AddOpBNormM;
-   logic          AddInvalidE, AddInvalidM;
-   logic 		   AddDenormInE, AddDenormInM;
-   logic          AddSwapE, AddSwapM;
-   logic          AddNormOvflowE, AddNormOvflowM; //***this isn't used in addcvt2
-   logic          AddSignAE, AddSignAM;
-   logic 		   AddConvertE, AddConvertM;
-   logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
-   logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
-   logic [10:0] 	AddExponentE, AddExponentM;
-   logic [63:0] 	FAddResM, FAddResW;
-   logic [4:0] 	FAddFlgM, FAddFlgW;  
-   
-   // cmp signals 
-   logic 		   CmpNVE, CmpNVM, CmpNVW;
-   logic [63:0] 	CmpResE, CmpResM, CmpResW;
-   
-   // fsgn signals
-   logic [63:0] 	SgnResE, SgnResM;
-   logic        	SgnNVE, SgnNVM, SgnNVW;
-   logic [63:0]   FResM, FResW;
-   logic          FFlgM, FFlgW;
-   
-   // instantiation of W stage regfile signals
-   logic [63:0] 	AlignedSrcAM;
-   
-   // classify signals
-   logic [63:0] 	ClassResE, ClassResM;
-   
-   // 64-bit FPU result   
-   logic [63:0] 	FPUResult64W;                                           
-   logic [4:0] 	FPUFlagsW;
-   
-   
+  generate
+     if (`F_SUPPORTED) begin 
+      // control logic signal instantiation
+      logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;              // FP register write enable
+      logic [2:0] 	FrmD, FrmE, FrmM;                                  // FP rounding mode
+      logic 		   FmtD, FmtE, FmtM, FmtW;                                  // FP precision 0-single 1-double
+      logic 		   FDivStartD, FDivStartE;                                  // Start division
+      logic 		   FWriteIntD;                                              // Write to integer register
+      logic [1:0]    ForwardXE, ForwardYE, ForwardZE;                        // Input3 forwarding mux control signal
+      logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
+      logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM;                  // Select which opperation to do in each component
+      logic [1:0]    FResSelD, FResSelE, FResSelM;  
+      logic [1:0]    FIntResSelD, FIntResSelE, FIntResSelM;                                   
+      logic [4:0] 	Adr1E, Adr2E, Adr3E;
+      
+      // regfile signals
+      logic [4:0]    RdE, RdM, RdW;                                           // what address to write to    // ***Can take from ieu instead of pipelining
+      logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
+      logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
+      logic [`XLEN-1:0]   SrcXMAligned;
+      logic [63:0] 	SrcXE, SrcXM;                         // Input 1 to the various units (after forwarding)
+      logic [63:0] 	SrcYE, SrcYM;                                      // Input 2 to the various units (after forwarding)
+      logic [63:0] 	SrcZE, SrcZM;                                      // Input 3 to the various units (after forwarding)
+      
+      // div/sqrt signals
+      logic [63:0] 	FDivResultM, FDivResultW;
+      logic [4:0]    FDivSqrtFlgM, FDivSqrtFlgW;
+      logic          FDivSqrtDoneE;
+      logic [63:0] 	DivInput1E, DivInput2E;
+      logic          HoldInputs;                                              // keep forwarded inputs around during division
+      
+      // FMA signals
+      logic [105:0]	ProdManE, ProdManM; ///*** put pipline stages in units
+      logic [161:0]	AlignedAddendE, AlignedAddendM;                       
+      logic [12:0]	ProdExpE, ProdExpM;
+      logic 			AddendStickyE, AddendStickyM;
+      logic 			KillProdE, KillProdM;
+      logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
+      logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
+      logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
+      logic [63:0]   FMAResM, FMAResW;
+      logic [4:0]    FMAFlgM, FMAFlgW;
+
+      // add/cvt signals
+      logic [63:0] 	AddSumE, AddSumM;
+      logic [63:0]   AddSumTcE, AddSumTcM;
+      logic [3:0] 	AddSelInvE, AddSelInvM;
+      logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
+      logic 		   AddCorrSignE, AddCorrSignM;
+      logic          AddOp1NormE, AddOp1NormM;
+      logic          AddOp2NormE, AddOp2NormM;
+      logic          AddOpANormE,  AddOpANormM;
+      logic          AddOpBNormE, AddOpBNormM;
+      logic          AddInvalidE, AddInvalidM;
+      logic 		   AddDenormInE, AddDenormInM;
+      logic          AddSwapE, AddSwapM;
+      logic          AddNormOvflowE, AddNormOvflowM; //***this isn't used in addcvt2
+      logic          AddSignAE, AddSignAM;
+      logic 		   AddConvertE, AddConvertM;
+      logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
+      logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
+      logic [10:0] 	AddExponentE, AddExponentM;
+      logic [63:0] 	FAddResM, FAddResW;
+      logic [4:0] 	FAddFlgM, FAddFlgW;  
+      
+      // cmp signals 
+      logic 		   CmpNVE, CmpNVM, CmpNVW;
+      logic [63:0] 	CmpResE, CmpResM, CmpResW;
+      
+      // fsgn signals
+      logic [63:0] 	SgnResE, SgnResM;
+      logic        	SgnNVE, SgnNVM, SgnNVW;
+      logic [63:0]   FResM, FResW;
+      logic          FFlgM, FFlgW;
+      
+      // instantiation of W stage regfile signals
+      logic [63:0] 	AlignedSrcAM;
+      
+      // classify signals
+      logic [63:0] 	ClassResE, ClassResM;
+      
+      // 64-bit FPU result   
+      logic [63:0] 	FPUResult64W;                                           
+      logic [4:0] 	FPUFlagsW;
+      
+      



@ -134,189 +137,19 @@ module fpu (



-   //DECODE STAGE
-   
-   
-   // top-level controller for FPU
-   fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
-               .FRM_REGW, .IllegalFPUInstrD, .FWriteEnD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
-               .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
-   
-   // regfile instantiation
-   fregfile fregfile (clk, reset, FWriteEnW,
-			InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
-			FPUResult64W,
-			FRD1D, FRD2D, FRD3D);	
-   
-
-
-
-
-
-
-
-
-   //*****************
-   // D/E pipe registers
-   //*****************
-   flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
-   flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
-   flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-   flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
-   flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-                                                         {Adr1E,         Adr2E,         Adr3E});
-   flopenrc #(22) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-                        {FWriteEnD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD},
-                        {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE});
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-   //EXECUTION STAGE
-   
-   // Hazard unit for FPU
-   fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FWriteEnM, .FWriteEnW, .RdM, .RdW, .FResultSelM, .FStallD, 
-                     .ForwardXE, .ForwardYE, .ForwardZE);
-
-   // forwarding muxs
-   mux3  #(64)  fxemux(FRD1E, FPUResult64W, FResM, ForwardXE, SrcXE);
-   mux3  #(64)  fyemux(FRD2E, FPUResult64W, FResM, ForwardYE, SrcYE);
-   mux3  #(64)  fzemux(FRD3E, FPUResult64W, FResM, ForwardZE, SrcZE);
-
-   
-   // first of two-stage instance of floating-point fused multiply-add unit
-   fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .ProdManE, .AlignedAddendE,
-               .ProdExpE, .AddendStickyE, .KillProdE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
-               .XNaNE, .YNaNE, .ZNaNE );
-   
-   // first and only instance of floating-point divider
-   logic fpdivClk;
-   
-   clockgater fpdivclkg(.E(FDivStartE),
-			.SE(1'b0),
-			.CLK(clk),
-			.ECLK(fpdivClk));
-   
-   // capture the inputs for div/sqrt	 
-   flopenrc #(64) reg_input1 (.d(SrcXE), .q(DivInput1E),
-               .en(~HoldInputs), .clear(FDivSqrtDoneE),
-               .reset(reset),  .clk(clk));
-   flopenrc #(64) reg_input2 (.d(SrcYE), .q(DivInput2E),
-               .en(~HoldInputs), .clear(FDivSqrtDoneE),
-               .reset(reset),  .clk(clk));
-
-   fdivsqrt fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
-                     .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
-                     .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
-   
-
-
-   // first of two-stage instance of floating-point add/cvt unit
-   fpuaddcvt1 fpadd1 (.SrcXE, .SrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
-                     .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
-                     .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
-                     .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
-   
-   // first and only instance of floating-point comparator
-   fcmp fcmp (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpNVE, CmpResE);
-   
-   // first and only instance of floating-point sign converter
-   fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .SrcXE, .SrcYE, .SgnResE, .SgnNVE);
-   
-   // first and only instance of floating-point classify unit
-   fclassify fclassify (.SrcXE, .FmtE, .ClassResE);
-
-   // output for store instructions
-   assign FWriteDataE = FmtE ? SrcYE[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcYE[63:32]};
-   //***swap to mux
-
-
-
-
-
-
-
-
-
-
-   //*****************
-   // E/M pipe registers
-   //*****************
-   flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, SrcXE, SrcXM);
-   flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, SrcYE, SrcYM);
-   flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, SrcZE, SrcZM);
-   
-   flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
-   flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
-   flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
-   flopenrc #(11) EMRegFma4(clk, reset, FlushM, ~StallM, 
-                              {AddendStickyE, KillProdE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE},
-                              {AddendStickyM, KillProdM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM});
-
-   flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
-   flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
-   flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
-   flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
-   flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
-   flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
-   flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
-   flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
-   flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, 
-                           {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
-                           {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
-
-   flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
-   flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
-   
-   flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
-   flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
-   
-   flopenrc #(22) EMCtrlReg(clk, reset, FlushM, ~StallM,
-                        {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE, FOpCtrlE, FWriteIntE},
-                        {FWriteEnM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});
-
-   flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
-   
-
-
-
-
-
-
-
-   //BEGIN MEMORY STAGE
-   
-   mux3  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, FResSelM, FResM);
-   mux3  #(1)  FFlgMux(1'b0, SgnNVM, CmpNVM, FResSelM, FFlgM);
-
-   //***change to mux
-   assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
-   mux3  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXMAligned, ClassResM[`XLEN-1:0], FIntResSelM, FIntResM);
-
-   // second instance of two-stage FMA unit
-   fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .FrmM, .FmtM, 
-            .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
-            .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
-            .FMAResM, .FMAFlgM);
-   
-   // second instance of two-stage floating-point add/cvt unit
-   fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
-                     .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
-                     .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
-                     .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
-   
-   // Align SrcA to MSB when single precicion
-   mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
+      //DECODE STAGE
+      
+      
+      // top-level controller for FPU
+      fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
+                  .FRM_REGW, .IllegalFPUInstrD, .FWriteEnD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+                  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
+      
+      // regfile instantiation
+      fregfile fregfile (clk, reset, FWriteEnW,
+            InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
+            FPUResult64W,
+            FRD1D, FRD2D, FRD3D);	
      


@ -326,77 +159,260 @@ module fpu (



+      //*****************
+      // D/E pipe registers
+      //*****************
+      flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
+      flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
+      flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
+      flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
+      flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+                                                            {Adr1E,         Adr2E,         Adr3E});
+      flopenrc #(22) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+                           {FWriteEnD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD},
+                           {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE});
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+      //EXECUTION STAGE
+      
+      // Hazard unit for FPU
+      fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FWriteEnM, .FWriteEnW, .RdM, .RdW, .FResultSelM, .FStallD, 
+                        .ForwardXE, .ForwardYE, .ForwardZE);
+
+      // forwarding muxes
+      mux3  #(64)  fxemux(FRD1E, FPUResult64W, FResM, ForwardXE, SrcXE);
+      mux3  #(64)  fyemux(FRD2E, FPUResult64W, FResM, ForwardYE, SrcYE);
+      mux3  #(64)  fzemux(FRD3E, FPUResult64W, FResM, ForwardZE, SrcZE);
+
+      
+      // first of two-stage instance of floating-point fused multiply-add unit
+      fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .ProdManE, .AlignedAddendE,
+                  .ProdExpE, .AddendStickyE, .KillProdE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
+                  .XNaNE, .YNaNE, .ZNaNE );
+      
+      // first and only instance of floating-point divider
+      logic fpdivClk;
+      
+      clockgater fpdivclkg(.E(FDivStartE),
+            .SE(1'b0),
+            .CLK(clk),
+            .ECLK(fpdivClk));
+      
+      // capture the inputs for div/sqrt	 
+      flopenrc #(64) reg_input1 (.d(SrcXE), .q(DivInput1E),
+                  .en(~HoldInputs), .clear(FDivSqrtDoneE),
+                  .reset(reset),  .clk(clk));
+      flopenrc #(64) reg_input2 (.d(SrcYE), .q(DivInput2E),
+                  .en(~HoldInputs), .clear(FDivSqrtDoneE),
+                  .reset(reset),  .clk(clk));
+
+      fdivsqrt fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
+                        .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
+                        .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
+      
+
+
+      // first of two-stage instance of floating-point add/cvt unit
+      fpuaddcvt1 fpadd1 (.SrcXE, .SrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
+                        .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
+                        .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
+                        .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
+      
+      // first and only instance of floating-point comparator
+      fcmp fcmp (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpNVE, CmpResE);
+      
+      // first and only instance of floating-point sign converter
+      fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .SrcXE, .SrcYE, .SgnResE, .SgnNVE);
+      
+      // first and only instance of floating-point classify unit
+      fclassify fclassify (.SrcXE, .FmtE, .ClassResE);
+
+      // output for store instructions
+      assign FWriteDataE = FmtE ? SrcYE[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcYE[63:32]};
+      //***swap to mux
+
+
+
+
+
+
+
+
+
+
+      //*****************
+      // E/M pipe registers
+      //*****************
+      flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, SrcXE, SrcXM);
+      flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, SrcYE, SrcYM);
+      flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, SrcZE, SrcZM);
+      
+      flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
+      flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
+      flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
+      flopenrc #(11) EMRegFma4(clk, reset, FlushM, ~StallM, 
+                                 {AddendStickyE, KillProdE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE},
+                                 {AddendStickyM, KillProdM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM});
+
+      flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
+      flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
+      flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
+      flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
+      flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
+      flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
+      flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
+      flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
+      flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, 
+                              {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
+                              {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
+
+      flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
+      flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
+      
+      flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
+      flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
+      
+      flopenrc #(22) EMCtrlReg(clk, reset, FlushM, ~StallM,
+                           {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE, FOpCtrlE, FWriteIntE},
+                           {FWriteEnM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});
+
+      flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
+      
+
+
+
+
+
+
+
+      //BEGIN MEMORY STAGE
+      
+      mux3  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, FResSelM, FResM);
+      mux3  #(1)  FFlgMux(1'b0, SgnNVM, CmpNVM, FResSelM, FFlgM);
+
+      //***change to mux
+      assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
+      mux3  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXMAligned, ClassResM[`XLEN-1:0], FIntResSelM, FIntResM);
+
+      // second instance of two-stage FMA unit
+      fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .FrmM, .FmtM, 
+               .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
+               .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
+               .FMAResM, .FMAFlgM);
+      
+      // second instance of two-stage floating-point add/cvt unit
+      fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
+                        .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
+                        .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
+                        .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
+      
+      // Align SrcA to MSB when single precision
+      mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
         
-   //*****************
-   // M/W pipe registers
-   //*****************
-   flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-   flopenrc #(5) MWRegFma2(clk, reset, FlushW, ~StallW, FMAFlgM, FMAFlgW); 
-   
-   flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
-   flopenrc #(5) MWRegDiv2(clk, reset, FlushW, ~StallW, FDivSqrtFlgM, FDivSqrtFlgW);
-   
-   flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
-   flopenrc #(5) MWRegAdd2(clk, reset, FlushW, ~StallW, FAddFlgM, FAddFlgW); 
-   
-   flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpNVM, CmpNVW); 
-   flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);
-
-   flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
-   flopenrc #(1) MWRegClass1(clk, reset, FlushW, ~StallW, FFlgM, FFlgW);
-   
-   flopenrc #(11) MWCtrlReg(clk, reset, FlushW, ~StallW,
-                        {FWriteEnM, FResultSelM, RdM, FmtM, FWriteIntM},
-                        {FWriteEnW, FResultSelW, RdW, FmtW, FWriteIntW});
-   
-   
-
-
-
-
-  //#########################################
-  // BEGIN WRITEBACK STAGE
-  //#########################################





-//***turn into muxs
-   always_comb begin
-      case (FResultSelW)
-	3'b000 : FPUFlagsW = 5'b0;
-	3'b001 : FPUFlagsW = FMAFlgW;
-	3'b010 : FPUFlagsW = FAddFlgW;
-	3'b011 : FPUFlagsW = FDivSqrtFlgW;
-	3'b100 : FPUFlagsW = {4'b0,FFlgW};
-	default : FPUFlagsW = 5'bxxxxx;
-      endcase
-   end
-
-   always_comb begin
-      case (FResultSelW)
-	3'b000 : FPUResult64W = FmtW ? {ReadDataW, {64-`XLEN{1'b0}}} : {ReadDataW[31:0], 32'b0};
-	3'b001 : FPUResult64W = FMAResW;
-	3'b010 : FPUResult64W = FAddResW;
-	3'b011 : FPUResult64W = FDivResultW;
-	3'b100 : FPUResult64W = FResW;
-	default : FPUResult64W = 64'bxxxxx;
-      endcase
-   end
-   
-   
-   // interface between XLEN size datapath and double-precision sized
-   // floating-point results
-   //
-   // define offsets for LSB zero extension or truncation
-   always_comb begin      
-      // zero extension 
-//***turn into mux
-      FPUResultW = FmtW ? FPUResult64W[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FPUResult64W[63:32]};
-      //*** put into mem stage
-      SetFflagsM = FPUFlagsW;      
+
+
+
+            
+      //*****************
+      // M/W pipe registers
+      //*****************
+      flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
+      flopenrc #(5) MWRegFma2(clk, reset, FlushW, ~StallW, FMAFlgM, FMAFlgW); 
+      
+      flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
+      flopenrc #(5) MWRegDiv2(clk, reset, FlushW, ~StallW, FDivSqrtFlgM, FDivSqrtFlgW);
+      
+      flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
+      flopenrc #(5) MWRegAdd2(clk, reset, FlushW, ~StallW, FAddFlgM, FAddFlgW); 
+      
+      flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpNVM, CmpNVW); 
+      flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);
+
+      flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
+      flopenrc #(1) MWRegClass1(clk, reset, FlushW, ~StallW, FFlgM, FFlgW);
+      
+      flopenrc #(11) MWCtrlReg(clk, reset, FlushW, ~StallW,
+                           {FWriteEnM, FResultSelM, RdM, FmtM, FWriteIntM},
+                           {FWriteEnW, FResultSelW, RdW, FmtW, FWriteIntW});
+      
+      
+
+
+
+
+   //#########################################
+   // BEGIN WRITEBACK STAGE
+   //#########################################
+
+
+
+
+
+   //***turn into muxs
+      always_comb begin
+         case (FResultSelW)
+      3'b000 : FPUFlagsW = 5'b0;
+      3'b001 : FPUFlagsW = FMAFlgW;
+      3'b010 : FPUFlagsW = FAddFlgW;
+      3'b011 : FPUFlagsW = FDivSqrtFlgW;
+      3'b100 : FPUFlagsW = {4'b0,FFlgW};
+      default : FPUFlagsW = 5'bxxxxx;
+         endcase
+      end
+
+      always_comb begin
+         case (FResultSelW)
+      3'b000 : FPUResult64W = FmtW ? {ReadDataW, {64-`XLEN{1'b0}}} : {ReadDataW[31:0], 32'b0};
+      3'b001 : FPUResult64W = FMAResW;
+      3'b010 : FPUResult64W = FAddResW;
+      3'b011 : FPUResult64W = FDivResultW;
+      3'b100 : FPUResult64W = FResW;
+      default : FPUResult64W = 64'bxxxxx;
+         endcase
+      end
+      
+      
+      // interface between XLEN size datapath and double-precision sized
+      // floating-point results
+      //
+      // define offsets for LSB zero extension or truncation
+      always_comb begin      
+         // zero extension 
+   //***turn into mux
+         FPUResultW = FmtW ? FPUResult64W[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FPUResult64W[63:32]};
+         //*** put into mem stage
+         SetFflagsM = FPUFlagsW;      
+      end
+   end else begin // no F_SUPPORTED; tie outputs low
+     assign FStallD = 0;
+     assign FWriteIntE = 0; 
+     assign FWriteIntM = 0;
+     assign FWriteIntW = 0;
+     assign FWriteDataE = 0;
+     assign FIntResM = 0;
+     assign FDivBusyE = 0;
+     assign IllegalFPUInstrD = 1;
+     assign SetFflagsM = 0;
+     assign FPUResultW = 0;
   end
+  endgenerate 
  
 endmodule // fpu

--- a/wally-pipelined/src/generic/shift.sv
+++ b/wally-pipelined/src/generic/shift.sv
@ -38,13 +38,12 @@ module shift_right #(parameter WIDTH=8)

   assign stage[0] = A;   
   generate
-      for (i=0;i<$clog2(WIDTH);i=i+1)
-	begin : genbit
-	   mux2 #(WIDTH) mux_inst (stage[i], 
+      for (i=0;i<$clog2(WIDTH);i=i+1) begin : genbit
+	      mux2 #(WIDTH) mux_inst (stage[i], 
 				   {{(WIDTH/(2**(i+1))){1'b0}}, stage[i][WIDTH-1:WIDTH/(2**(i+1))]}, 
 				   Shift[$clog2(WIDTH)-i-1], 
 				   stage[i+1]);
-	end
+	   end
   endgenerate
   assign Z = stage[$clog2(WIDTH)];   

@ -60,13 +59,12 @@ module shift_left #(parameter WIDTH=8)
   
   assign stage[0] = A;   
   generate
-      for (i=0;i<$clog2(WIDTH);i=i+1)
-	begin : genbit
-	   mux2 #(WIDTH) mux_inst (stage[i], 
+      for (i=0;i<$clog2(WIDTH);i=i+1) begin : genbit
+	     mux2 #(WIDTH) mux_inst (stage[i], 
 				   {stage[i][WIDTH-1-WIDTH/(2**(i+1)):0], {(WIDTH/(2**(i+1))){1'b0}}}, 
 				   Shift[$clog2(WIDTH)-i-1], 
 				   stage[i+1]);
-	end
+	   end
   endgenerate
   assign Z = stage[$clog2(WIDTH)];   

--- a/wally-pipelined/src/ieu/alu.sv
+++ b/wally-pipelined/src/ieu/alu.sv
@ -42,7 +42,7 @@ module alu #(parameter WIDTH=32) (
  assign {carry, presum} = a + condinvb + {{(WIDTH-1){1'b0}},alucontrol[3]};
  
  // support W-type RV64I ADDW/SUBW/ADDIW that sign-extend 32-bit result to 64 bits
-  generate 
+  generate
    if (WIDTH==64)
      assign sum = w64 ? {{32{presum[31]}}, presum[31:0]} : presum;
    else
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@ -129,7 +129,7 @@ module datapath (
  flopenrc #(5)    RdWEg(clk, reset, FlushW, ~StallW, RdM, RdW);

  // handle Store Conditional result if atomic extension supported
-  generate 
+  generate
    if (`A_SUPPORTED)
      assign SCResultW = SquashSCW ? {{(`XLEN-1){1'b0}}, 1'b1} : {{(`XLEN-1){1'b0}}, 1'b0};
    else 
--- a/wally-pipelined/src/ifu/SRAM2P1R1W.sv
+++ b/wally-pipelined/src/ifu/SRAM2P1R1W.sv
@ -97,11 +97,11 @@ module SRAM2P1R1W
  
  // write port
  generate
-    for (index = 0; index < Width; index = index + 1) begin    
+    for (index = 0; index < Width; index = index + 1) begin:mem
      always_ff @ (posedge clk) begin
-	if (WEN1Q & BitWEN1[index]) begin
-	  memory[WA1Q][index] <= WD1Q[index];
-	end
+	      if (WEN1Q & BitWEN1[index]) begin
+	        memory[WA1Q][index] <= WD1Q[index];
+	      end
      end
    end
  endgenerate
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@ -188,7 +188,7 @@ module ifu (
  flopenl #(`XLEN) pcreg(clk, reset, ~StallF & ~ICacheStallF, PCNextF, `RESET_VECTOR, PCF);

  // branch and jump predictor
-  generate 
+  generate
    if (`BPRED_ENABLED == 1) begin : bpred
      // I am making the port connection explicit for now as I want to see them and they will be changing.
      bpred bpred(.*,
--- a/wally-pipelined/src/ifu/localHistoryPredictor.sv
+++ b/wally-pipelined/src/ifu/localHistoryPredictor.sv
@ -67,7 +67,7 @@ module localHistoryPredictor

  genvar 		   index;
  generate
-    for (index = 0; index < 2**m; index = index +1) begin
+    for (index = 0; index < 2**m; index = index +1) begin:localhist
      
      flopenr #(k) LocalHistoryRegister(.clk(clk),
 					.reset(reset),
--- a/wally-pipelined/src/lsu/dcache.sv
+++ b/wally-pipelined/src/lsu/dcache.sv
@ -151,7 +151,7 @@ module dcachecontroller #(parameter LINESIZE = 256) (

    genvar i;
    generate
-        for (i=0; i < WORDSPERLINE; i++) begin
+        for (i=0; i < WORDSPERLINE; i++) begin:sb
            flopenr #(`XLEN) flop(clk, reset, FetchState & (i == FetchWordNum), ReadDataW, DCacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]);
        end
    endgenerate
--- a/wally-pipelined/src/lsu/lsu.sv
+++ b/wally-pipelined/src/lsu/lsu.sv
@ -64,7 +64,7 @@ module lsu (
  output logic [1:0] 	      AtomicMaskedM,
  input logic 		      MemAckW, // from ahb
  input logic [`XLEN-1:0]     HRDATAW, // from ahb
-  output logic [2:0] 	      Funct3MfromLSU,
+  output logic [2:0] 	      SizeFromLSU,
  output logic 		      StallWfromLSU,


@ -132,7 +132,7 @@ module lsu (
  logic 	    MMUTranslate;
  logic 	    HPTWRead;
  logic [1:0] 	    MemRWMtoLSU;
-  logic [2:0] 	    Funct3MtoLSU;
+  logic [2:0] 	    SizeToLSU;
  logic [1:0] 	    AtomicMtoLSU;
  logic [`XLEN-1:0] MemAdrMtoLSU;
  logic [`XLEN-1:0] WriteDataMtoLSU;
@ -204,7 +204,7 @@ module lsu (
 		 // LSU
 		 .DisableTranslation(DisableTranslation),
 		 .MemRWMtoLSU(MemRWMtoLSU),
-		 .Funct3MtoLSU(Funct3MtoLSU),
+		 .SizeToLSU(SizeToLSU),
 		 .AtomicMtoLSU(AtomicMtoLSU),
 		 .MemAdrMtoLSU(MemAdrMtoLSU),          
 		 .WriteDataMtoLSU(WriteDataMtoLSU),   // *** ??????????????
@ -220,7 +220,7 @@ module lsu (
  mmu #(.TLB_ENTRIES(`DTLB_ENTRIES), .IMMU(0))
  dmmu(.TLBAccessType(MemRWMtoLSU),
       .VirtualAddress(MemAdrMtoLSU),
-       .Size(Funct3MtoLSU[1:0]),
+       .Size(SizeToLSU[1:0]),
       .PTEWriteVal(PageTableEntryM),
       .PageTypeWriteVal(PageTypeM),
       .TLBWrite(DTLBWriteM),
@ -244,7 +244,7 @@ module lsu (

  // Determine if an Unaligned access is taking place
  always_comb
-    case(Funct3MtoLSU[1:0]) 
+    case(SizeToLSU[1:0]) 
      2'b00:  DataMisalignedMfromLSU = 0;                       // lb, sb, lbu
      2'b01:  DataMisalignedMfromLSU = MemAdrMtoLSU[0];              // lh, sh, lhu
      2'b10:  DataMisalignedMfromLSU = MemAdrMtoLSU[1] | MemAdrMtoLSU[0]; // lw, sw, flw, fsw, lwu
@ -400,7 +400,7 @@ module lsu (
  end // always_comb

  // *** for now just pass through size
-  assign Funct3MfromLSU = Funct3MtoLSU;
+  assign SizeFromLSU = SizeToLSU;
  assign StallWfromLSU = StallWtoLSU;
  

--- a/wally-pipelined/src/lsu/lsuArb.sv
+++ b/wally-pipelined/src/lsu/lsuArb.sv
@ -54,7 +54,7 @@ module lsuArb
   // to LSU   
   output logic 	    DisableTranslation, 
   output logic [1:0] 	    MemRWMtoLSU,
-   output logic [2:0] 	    Funct3MtoLSU,
+   output logic [2:0] 	    SizeToLSU,
   output logic [1:0] 	    AtomicMtoLSU,
   output logic [`XLEN-1:0] MemAdrMtoLSU,
   output logic [`XLEN-1:0] WriteDataMtoLSU,
@ -87,6 +87,7 @@ module lsuArb
  statetype CurrState, NextState;
  logic 		    SelPTW;
  logic 		    HPTWStallD;
+  logic [2:0] PTWSize;
  

  flopenl #(.TYPE(statetype)) StateReg(.clk(clk),
@ -138,12 +139,9 @@ module lsuArb
  assign MemRWMtoLSU = SelPTW ? {HPTWRead, 1'b0} : MemRWM;
  
  generate
-    if (`XLEN == 32) begin
-      assign Funct3MtoLSU = SelPTW ? 3'b010 : Funct3M;
-    end else begin
-      assign Funct3MtoLSU = SelPTW ? 3'b011 : Funct3M;
-    end
+    assign PTWSize = (`XLEN==32 ? 3'b010 : 3'b011); // 32 or 64-bit access from hptw
  endgenerate
+  mux2 #(3) sizemux(Funct3M, PTWSize, SelPTW, SizeToLSU);

  assign AtomicMtoLSU = SelPTW ? 2'b00 : AtomicM;
  assign MemAdrMtoLSU = SelPTW ? HPTWPAdr : MemAdrM;
--- a/wally-pipelined/src/mmu/pmpadrdec.sv
+++ b/wally-pipelined/src/mmu/pmpadrdec.sv
@ -76,8 +76,9 @@ module pmpadrdec (
  generate
    assign Mask[1:0] = 2'b11;
     assign Mask[2] = (AdrMode == NAPOT); // mask has 0s in upper bits for NA4 region
-    for (i=3; i < `PA_BITS; i=i+1) 
+    for (i=3; i < `PA_BITS; i=i+1) begin:mask
      assign Mask[i] = Mask[i-1] & PMPAdr[i-3]; // NAPOT mask: 1's indicate bits to ignore
+    end
   endgenerate
  // verilator lint_on UNOPTFLAT

--- a/wally-pipelined/src/mmu/pmpchecker.sv
+++ b/wally-pipelined/src/mmu/pmpchecker.sv
@ -63,12 +63,6 @@ module pmpchecker (
  // verilator lint_on UNOPTFLAT
  logic [`PMP_ENTRIES-1:0]   PAgePMPAdr;  // for TOR PMP matching, PhysicalAddress > PMPAdr[i]
  genvar i,j;
- /*
-  generate // extract 8-bit chunks from PMPCFG array
-    for (j=0; j<`PMP_ENTRIES; j = j+8)
-      assign {PMPCfg[j+7], PMPCfg[j+6], PMPCfg[j+5], PMPCfg[j+4],
-              PMPCfg[j+3], PMPCfg[j+2], PMPCfg[j+1], PMPCfg[j]} = PMPCFG_ARRAY_REGW[j/8];
-  endgenerate */

  pmpadrdec pmpadrdecs[`PMP_ENTRIES-1:0](
    .PhysicalAddress, 
@ -80,7 +74,6 @@ module pmpchecker (
    .NoLowerMatchOut(NoLowerMatch),
    .Match, .Active, .L, .X, .W, .R);

-
  // Only enforce PMP checking for S and U modes when at least one PMP is active or in Machine mode when L bit is set in selected region
  assign EnforcePMP = (PrivilegeModeW == `M_MODE) ? |L : |Active; 

--- a/wally-pipelined/src/mmu/tlb.sv
+++ b/wally-pipelined/src/mmu/tlb.sv
@ -111,6 +111,7 @@ module tlb #(parameter TLB_ENTRIES = 8,
  logic [1:0]            HitPageType;
  logic                  CAMHit;
  logic [`ASID_BITS-1:0] ASID;
+  logic                  DAFault;

  // Grab the sv mode from SATP and determine whether translation should occur
  assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS];
@ -165,7 +166,9 @@ module tlb #(parameter TLB_ENTRIES = 8,
      // only execute non-user mode pages.
      assign ImproperPrivilege = ((EffectivePrivilegeMode == `U_MODE) && ~PTE_U) ||
        ((EffectivePrivilegeMode == `S_MODE) && PTE_U);
-      assign TLBPageFault = Translate && TLBHit && (ImproperPrivilege || ~PTE_X);
+      // fault for software handling if access bit is off
+      assign DAFault = ~PTE_A;
+      assign TLBPageFault = Translate && TLBHit && (ImproperPrivilege || ~PTE_X || DAFault);
    end else begin
      logic ImproperPrivilege, InvalidRead, InvalidWrite;

@ -180,7 +183,9 @@ module tlb #(parameter TLB_ENTRIES = 8,
      // Check for write error. Writes are invalid when the page's write bit is
      // low.
      assign InvalidWrite = WriteAccess && ~PTE_W;
-      assign TLBPageFault = Translate && TLBHit && (ImproperPrivilege || InvalidRead || InvalidWrite);
+      // Fault for software handling if access bit is off or writing a page with dirty bit off
+      assign DAFault = ~PTE_A | WriteAccess & ~PTE_D; 
+      assign TLBPageFault = Translate && TLBHit && (ImproperPrivilege || InvalidRead || InvalidWrite || DAFault);
    end
  endgenerate

--- a/wally-pipelined/src/mmu/tlbpriority.sv
+++ b/wally-pipelined/src/mmu/tlbpriority.sv
@ -41,8 +41,9 @@ module tlbpriority #(parameter ENTRIES = 8) (
  genvar i;
  generate
    assign nolower[0] = 1;
-    for (i=1; i<ENTRIES; i++) 
+    for (i=1; i<ENTRIES; i++) begin:therm
      assign nolower[i] = nolower[i-1] & ~a[i-1];
+    end
  endgenerate
  // verilator lint_on UNOPTFLAT
  assign y = a & nolower;
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@ -299,10 +299,9 @@ module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c,
   logic [WIDTH:0] 					  carry_temp;   
   genvar 						  i;
   generate
-      for (i=0;i<WIDTH;i=i+1)
-	begin : genbit
-	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
-	end
+      for (i=0;i<WIDTH;i=i+1) begin : genbit
+	    fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
+	  end
   endgenerate
   assign carry = {carry_temp[WIDTH-1:1], 1'b0};     

--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@ -138,7 +138,9 @@ module muldiv (
 	 flopenrc #(`XLEN) MulDivResultWReg(clk, reset, FlushW, ~StallW, MulDivResultM, MulDivResultW);	 

      end else begin // no M instructions supported
-	 assign MulDivResultW = 0; 
+	 	assign MulDivResultW = 0; 
+		assign DivBusyE = 0;
+		assign DivDoneE = 0;
      end
   endgenerate

--- a/wally-pipelined/src/privileged/csrc.sv
+++ b/wally-pipelined/src/privileged/csrc.sv
@ -87,7 +87,7 @@ module csrc #(parameter
    output logic             IllegalCSRCAccessM
  );

-  generate 
+  generate
    if (`ZCOUNTERS_SUPPORTED) begin
      //  logic [63:0] TIME_REGW, TIMECMP_REGW;
      logic [63:0] CYCLE_REGW, INSTRET_REGW;
--- a/wally-pipelined/src/privileged/csri.sv
+++ b/wally-pipelined/src/privileged/csri.sv
@ -70,7 +70,7 @@ module csri #(parameter
  // MEIP, MTIP, MSIP are read-only
  // SEIP, STIP, SSIP is writable in MIP if S mode exists
  // SSIP is writable in SIP if S mode exists
-  generate 
+  generate
    if (`S_SUPPORTED) begin
      assign MIP_WRITE_MASK = 12'h222; // SEIP, STIP, SSIP are writable in MIP (20210108-draft 3.1.9)
      assign SIP_WRITE_MASK = 12'h002; // SSIP is writable in SIP (privileged 20210108-draft 4.1.3)
--- a/wally-pipelined/src/privileged/csrn.sv
+++ b/wally-pipelined/src/privileged/csrn.sv
@ -49,7 +49,7 @@ module csrn #(parameter
  );

  // User mode CSRs below only needed when user mode traps are supported
-  generate  
+  generate
    if (`N_SUPPORTED) begin
      logic WriteUTVECM;
      logic WriteUSCRATCHM, WriteUEPCM;
--- a/wally-pipelined/src/privileged/csrs.sv
+++ b/wally-pipelined/src/privileged/csrs.sv
@ -66,7 +66,7 @@ module csrs #(parameter
  //logic [`XLEN-1:0] SEDELEG_MASK = ~(zero | 3'b111 << 9); // sedeleg[11:9] hardwired to zero per Privileged Spec 3.1.8

  // Supervisor mode CSRs sometimes supported
-  generate  
+  generate
    if (`S_SUPPORTED) begin
      logic WriteSTVECM;
      logic WriteSSCRATCHM, WriteSEPCM;
--- a/wally-pipelined/src/privileged/csru.sv
+++ b/wally-pipelined/src/privileged/csru.sv
@ -43,7 +43,7 @@ module csru #(parameter
  );

  // Floating Point CSRs in User Mode only needed if Floating Point is supported
-  generate  
+  generate
    if (`F_SUPPORTED | `D_SUPPORTED) begin
      logic [4:0] FFLAGS_REGW;
      logic WriteFFLAGSM, WriteFRMM; //, WriteFCSRM;
--- a/wally-pipelined/src/uncore/gpio.sv
+++ b/wally-pipelined/src/uncore/gpio.sv
@ -151,7 +151,7 @@ module gpio (
  end

  // chip i/o
-  generate 
+  generate
    if (`GPIO_LOOPBACK_TEST) // connect OUT to IN for loopback testing
      assign input0d = GPIOPinsOut & input_en & output_en;
    else
--- a/wally-pipelined/src/uncore/plic.sv
+++ b/wally-pipelined/src/uncore/plic.sv
@ -164,17 +164,13 @@ module plic (
  flopr #(N) intPendingFlop(HCLK,~HRESETn,nextIntPending,intPending);

  // pending array - indexed by priority_lvl x source_ID
-  genvar i;
+  genvar i, j;
  generate
-    for (i=1; i<=N; i=i+1) begin
-      // *** make sure that this synthesizes into N decoders, not 7*N 3-bit equality comparators (right?)
-      assign pendingArray[7][i] = (intPriority[i]==7) & intEn[i] & intPending[i];
-      assign pendingArray[6][i] = (intPriority[i]==6) & intEn[i] & intPending[i];
-      assign pendingArray[5][i] = (intPriority[i]==5) & intEn[i] & intPending[i];
-      assign pendingArray[4][i] = (intPriority[i]==4) & intEn[i] & intPending[i];
-      assign pendingArray[3][i] = (intPriority[i]==3) & intEn[i] & intPending[i];
-      assign pendingArray[2][i] = (intPriority[i]==2) & intEn[i] & intPending[i];
-      assign pendingArray[1][i] = (intPriority[i]==1) & intEn[i] & intPending[i];
+    for (j=1; j<=7; j++) begin: pending
+      for (i=1; i<=N; i=i+1) begin: pendingbit
+        // *** make sure that this synthesizes into N decoders, not 7*N 3-bit equality comparators (right?)
+        assign pendingArray[j][i] = (intPriority[i]==j) & intEn[i] & intPending[i];
+      end
    end
  endgenerate
  // pending array, except grouped by priority
@ -184,7 +180,9 @@ module plic (
                                 |pendingArray[4],
                                 |pendingArray[3],
                                 |pendingArray[2],
-                                 |pendingArray[1]};
+                                 |pendingArray[1]}; 
+  //assign pendingPGrouped = pendingArray.or;
+
  // pendingPGrouped, except only topmost priority is active
  assign pendingMaxP[7:1] = {pendingPGrouped[7],
                             pendingPGrouped[6] & ~|pendingPGrouped[7],
@ -202,24 +200,24 @@ module plic (
                                    | ({N{pendingMaxP[2]}} & pendingArray[2])
                                    | ({N{pendingMaxP[1]}} & pendingArray[1]);
  // find the lowest ID amongst active interrupts at the highest priority
-  integer j;
-  // *** verify that this synthesizes to a reasonable priority encoder and that j doesn't actually exist in hardware
+  int k;
+  // *** verify that this synthesizes to a reasonable priority encoder and that k doesn't actually exist in hardware
  always_comb begin
    intClaim = 6'b0;
-    for(j=N; j>0; j=j-1) begin
-      if(pendingRequestsAtMaxP[j]) intClaim = j[5:0];
+    for(k=N; k>0; k=k-1) begin
+      if(pendingRequestsAtMaxP[k]) intClaim = k[5:0];
    end
  end
  
  // create threshold mask
-  always_comb begin
-    threshMask[7] = ~(7==intThreshold);
-    threshMask[6] = ~(6==intThreshold) & threshMask[7];
-    threshMask[5] = ~(5==intThreshold) & threshMask[6];
-    threshMask[4] = ~(4==intThreshold) & threshMask[5];
-    threshMask[3] = ~(3==intThreshold) & threshMask[4];
-    threshMask[2] = ~(2==intThreshold) & threshMask[3];
-    threshMask[1] = ~(1==intThreshold) & threshMask[2];
+   always_comb begin
+    threshMask[7] = (intThreshold != 7);
+    threshMask[6] = (intThreshold != 6) & threshMask[7];
+    threshMask[5] = (intThreshold != 5) & threshMask[6];
+    threshMask[4] = (intThreshold != 4) & threshMask[5];
+    threshMask[3] = (intThreshold != 3) & threshMask[4];
+    threshMask[2] = (intThreshold != 2) & threshMask[3];
+    threshMask[1] = (intThreshold != 1) & threshMask[2];
  end
  // is the max priority > threshold?
  // *** would it be any better to first priority encode maxPriority into binary and then ">" with threshold?
--- a/wally-pipelined/src/uncore/uartPC16550D.sv
+++ b/wally-pipelined/src/uncore/uartPC16550D.sv
@ -291,7 +291,7 @@ module uartPC16550D(
  // although rxfullbit looks like a combinational loop, in one bit rxfifotail == i and breaks the loop
  generate
    genvar i;
-    for (i=0; i<16; i++) begin
+    for (i=0; i<16; i++) begin:rx
      assign RXerrbit[i] = |rxfifo[i][10:8]; // are any of the error conditions set?
      if (i > 0)
        assign rxfullbit[i] = ((rxfifohead==i) | rxfullbit[i-1]) & (rxfifotail != i);
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -159,7 +159,7 @@ module wallypipelinedhart

  // IEU vs HPTW arbitration signals to send to LSU
  logic [1:0] 		    MemRWMtoLSU;
-  logic [2:0] 		    Funct3MtoLSU;
+  logic [2:0] 		    SizeToLSU;
  logic [1:0] 		    AtomicMtoLSU;
  logic [`XLEN-1:0] 	    MemAdrMtoLSU;
  logic [`XLEN-1:0] 	    WriteDataMtoLSU;
@ -169,7 +169,7 @@ module wallypipelinedhart
  logic 		    DataMisalignedMfromLSU;
  logic 		    StallWtoLSU;
  logic 		    StallWfromLSU;  
-  logic [2:0] 		    Funct3MfromLSU;
+  logic [2:0] 		    SizeFromLSU;

  
  ifu ifu(.InstrInF(InstrRData),
@ -207,7 +207,7 @@ module wallypipelinedhart
 	  .AtomicMaskedM(AtomicMaskedM),
 	  .MemAckW(MemAckW),
 	  .HRDATAW(HRDATAW),
-	  .Funct3MfromLSU(Funct3MfromLSU),           // stays the same
+	  .SizeFromLSU(SizeFromLSU),           // stays the same
 	  .StallWfromLSU(StallWfromLSU),             // stays the same
 	  .DSquashBusAccessM(DSquashBusAccessM),     // probably removed after dcache implementation?
 	  // currently not connected (but will need to be used for lsu talking to ahb).
@ -261,7 +261,7 @@ module wallypipelinedhart
 	       //.InstrRData(InstrF), // hook up InstrF later
 	       .ISquashBusAccessF(1'b0), // *** temporary hack to disable PMP instruction fetch checking
 	       .WriteDataM(WriteDataM),
-	       .MemSizeM(Funct3MfromLSU[1:0]), .UnsignedLoadM(Funct3MfromLSU[2]),
+	       .MemSizeM(SizeFromLSU[1:0]), .UnsignedLoadM(SizeFromLSU[2]),
 	       .Funct7M(InstrM[31:25]),
 	       .HRDATAW(HRDATAW),
 	       .StallW(StallWfromLSU),
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -514,6 +514,9 @@ string tests32f[] = '{
  logic             HMASTLOCK;
  logic             HCLK, HRESETn;
  logic [`XLEN-1:0] PCW;
+
+  logic [`XLEN-1:0] debug;
+  assign debug = dut.uncore.dtim.RAM[536872960];
  
  flopenr #(`XLEN) PCWReg(clk, reset, ~dut.hart.ieu.dp.StallW, dut.hart.ifu.PCM, PCW);
  flopenr  #(32)   InstrWReg(clk, reset, ~dut.hart.ieu.dp.StallW,  dut.hart.ifu.InstrM, InstrW);
@ -656,10 +659,7 @@ string tests32f[] = '{
        // Check errors
        errors = (i == SIGNATURESIZE+1); // error if file is empty
        i = 0;
-        if (`XLEN == 32)
-          testadr = (`TIM_BASE+tests[test+1].atohex())/4;
-        else
-          testadr = (`TIM_BASE+tests[test+1].atohex())/8;
+        testadr = (`TIM_BASE+tests[test+1].atohex())/(`XLEN/8);
        /* verilator lint_off INFINITELOOP */
        while (signature[i] !== 'bx) begin
          //$display("signature[%h] = %h", i, signature[i]);
@ -669,14 +669,16 @@ string tests32f[] = '{
              // kind of hacky test for garbage right now
              errors = errors+1;
              $display("  Error on test %s result %d: adr = %h sim = %h, signature = %h", 
-                    tests[test], i, (testadr+i)*`XLEN/8, dut.uncore.dtim.RAM[testadr+i], signature[i]);
+                    tests[test], i, (testadr+i)*(`XLEN/8), dut.uncore.dtim.RAM[testadr+i], signature[i]);
              $stop;//***debug
            end
          end
          i = i + 1;
        end
        /* verilator lint_on INFINITELOOP */
-        if (errors == 0) $display("%s succeeded.  Brilliant!!!", tests[test]);
+        if (errors == 0) begin
+          $display("%s succeeded.  Brilliant!!!", tests[test]);
+        end
        else begin
          $display("%s failed with %d errors. :(", tests[test], errors);
          totalerrors = totalerrors+1;