Moved generate statements for optional units into wallypipelinedhart

2025-02-11 06:05:49 +00:00 · 2021-12-19 16:53:41 -08:00 · 2021-12-19 16:53:41 -08:00 · 3c3bfd055e
commit 3c3bfd055e
parent 53cd2ac049
12 changed files with 424 additions and 467 deletions
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -44,258 +44,244 @@ module fpu (
  output logic [4:0] 	   SetFflagsM        // FPU flags (to privileged unit)
  );
-  //*** make everything FLEN at some point
+   //*** make everything FLEN at some point
-  //*** add the 128 bit support to the if statement when needed
+   //*** add the 128 bit support to the if statement when needed
-  //*** make new tests for fp using testfloat that include flag checking and all rounding modes
+   //*** make new tests for fp using testfloat that include flag checking and all rounding modes
-  //*** what is the format for 16-bit - finding conflicting info online can't find anything specified in spec
+   //*** what is the format for 16-bit - finding conflicting info online can't find anything specified in spec
-  //*** only fma/mul and fp <-> int convert flags have been tested. test the others.
+   //*** only fma/mul and fp <-> int convert flags have been tested. test the others.
-  // FPU specifics:
+   // FPU specifics:
-  //    - uses NaN-blocking format
+   //    - uses NaN-blocking format
-  //        - if there are any unsused bits the most significant bits are filled with 1s
+   //        - if there are any unsused bits the most significant bits are filled with 1s
-  //                single stored in a double: | 32 1s | single precision value |
+   //                single stored in a double: | 32 1s | single precision value |
-  //    - sets the underflow after rounding
+   //    - sets the underflow after rounding
-  generate if (`F_SUPPORTED | `D_SUPPORTED) begin : fpu
+   // control signals
   logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
   logic [2:0] 	  FrmD, FrmE, FrmM;                   // FP rounding mode
   logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
   logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
   logic 		  FWriteIntD;                         // Write to integer register
   logic [1:0] 	  FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
   logic [1:0] 	  FResultSelD, FResultSelE;           // Select the result written to FP register
   logic [1:0] 	  FResultSelM, FResultSelW;           // Select the result written to FP register
   logic [2:0] 	  FOpCtrlD, FOpCtrlE;       // Select which opperation to do in each component
   logic [2:0] 	  FResSelD, FResSelE;       // Select one of the results that finish in the memory stage
   logic [1:0] 	  FIntResSelD, FIntResSelE;           // Select the result written to the integer resister
   logic [4:0] 	  Adr1E, Adr2E, Adr3E;                // adresses of each input
-     // control signals
+   // regfile signals
-     logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
+   logic [63:0] 	  FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
-     logic [2:0] 	  FrmD, FrmE, FrmM;                   // FP rounding mode
+   logic [63:0] 	  FRD1E, FRD2E, FRD3E;                // Read Data from FP register - execute stage
-     logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
+   logic [63:0] 	  FSrcXE;                             // Input 1 to the various units (after forwarding)
-     logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
+   logic [63:0] 	  FPreSrcYE, FSrcYE;                  // Input 2 to the various units (after forwarding)
-     logic 		  FWriteIntD;                         // Write to integer register
+   logic [63:0] 	  FPreSrcZE, FSrcZE;                  // Input 3 to the various units (after forwarding)
     logic [1:0] 	  FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
     logic [1:0] 	  FResultSelD, FResultSelE;           // Select the result written to FP register
     logic [1:0] 	  FResultSelM, FResultSelW;           // Select the result written to FP register
     logic [2:0] 	  FOpCtrlD, FOpCtrlE;       // Select which opperation to do in each component
     logic [2:0] 	  FResSelD, FResSelE;       // Select one of the results that finish in the memory stage
     logic [1:0] 	  FIntResSelD, FIntResSelE;           // Select the result written to the integer resister
     logic [4:0] 	  Adr1E, Adr2E, Adr3E;                // adresses of each input
     // regfile signals
     logic [63:0] 	  FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
     logic [63:0] 	  FRD1E, FRD2E, FRD3E;                // Read Data from FP register - execute stage
     logic [63:0] 	  FSrcXE;                             // Input 1 to the various units (after forwarding)
     logic [63:0] 	  FPreSrcYE, FSrcYE;                  // Input 2 to the various units (after forwarding)
     logic [63:0] 	  FPreSrcZE, FSrcZE;                  // Input 3 to the various units (after forwarding)
     // unpacking signals
     logic 		  XSgnE, YSgnE, ZSgnE;                // input's sign - execute stage
     logic 		  XSgnM, YSgnM;                       // input's sign - memory stage
     logic [10:0] 	  XExpE, YExpE, ZExpE;                // input's exponent - execute stage
     logic [10:0] 	  XExpM, YExpM, ZExpM;                // input's exponent - memory stage
     logic [52:0] 	  XManE, YManE, ZManE;                // input's fraction - execute stage
     logic [52:0] 	  XManM, YManM, ZManM;                // input's fraction - memory stage
     logic [10:0] 	  BiasE;                              // bias based on precision (single=7f double=3ff)
     logic 		  XNaNE, YNaNE, ZNaNE;                // is the input a NaN - execute stage
     logic 		  XNaNM, YNaNM, ZNaNM;                // is the input a NaN - memory stage
     logic 		  XNaNQ, YNaNQ;                       // is the input a NaN - divide
     logic 		  XSNaNE, YSNaNE, ZSNaNE;             // is the input a signaling NaN - execute stage
     logic 		  XSNaNM, YSNaNM, ZSNaNM;             // is the input a signaling NaN - memory stage
     logic 		  XDenormE, YDenormE, ZDenormE;       // is the input denormalized
     logic 		  XZeroE, YZeroE, ZZeroE;             // is the input zero - execute stage
     logic 		  XZeroM, YZeroM, ZZeroM;             // is the input zero - memory stage
     logic 		  XZeroQ, YZeroQ;                     // is the input zero - divide
     logic 		  XInfE, YInfE, ZInfE;                // is the input infinity - execute stage
     logic 		  XInfM, YInfM, ZInfM;                // is the input infinity - memory stage
     logic 		  XInfQ, YInfQ;                       // is the input infinity - divide
     logic 		  XExpMaxE;                           // is the exponent all ones (max value)
     logic 		  XNormE;                             // is normal
     logic 		  FmtQ;
     logic 		  FOpCtrlQ;     
     // result and flag signals
     logic [63:0] 	  FDivResM, FDivResW;                 // divide/squareroot result
     logic [4:0] 	  FDivFlgM;                 // divide/squareroot flags  
     logic [63:0] 	  FMAResM, FMAResW;                   // FMA/multiply result
     logic [4:0] 	  FMAFlgM;                   // FMA/multiply result	
     logic [63:0] 	  ReadResW;                           // read result (load instruction)
     logic [63:0] 	  CvtFpResE;    // add/FP -> FP convert result
     logic [4:0] 	  CvtFpFlgE;    // add/FP -> FP convert flags
     logic [63:0] 	  CvtResE;                   // FP <-> int convert result
     logic [4:0] 	  CvtFlgE;                   // FP <-> int convert flags //*** trim this	
     logic [63:0] 	  ClassResE;               // classify result
     logic [63:0] 	  CmpResE;                   // compare result
     logic 		  CmpNVE;                     // compare invalid flag (Not Valid)     
     logic [63:0] 	  SgnResE;                   // sign injection result
     logic 		  SgnNVE;                     // sign injection invalid flag (Not Valid)     
     logic [63:0] 	  FResE, FResM, FResW;                // selected result that is ready in the memory stage
     logic [4:0] 	  FFlgE, FFlgM;                       // selected flag that is ready in the memory stage     
     logic [`XLEN-1:0] 	  FIntResE;     
     logic [63:0] 	  FPUResultW;                         // final FP result being written to the FP register     
     // other signals
     logic 		  FDivSqrtDoneE;                      // is divide done
     logic [63:0] 	  DivInput1E, DivInput2E;             // inputs to divide/squareroot unit
     logic 		  load_preload;                       // enable for FF on fpdivsqrt     
     logic [63:0] 	  AlignedSrcAE;                       // align SrcA to the floating point format
-     // DECODE STAGE
+   // unpacking signals
-     
+   logic 		  XSgnE, YSgnE, ZSgnE;                // input's sign - execute stage
-     // calculate FP control signals
+   logic 		  XSgnM, YSgnM;                       // input's sign - memory stage
-     fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
+   logic [10:0] 	  XExpE, YExpE, ZExpE;                // input's exponent - execute stage
-		  .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+   logic [10:0] 	  XExpM, YExpM, ZExpM;                // input's exponent - memory stage
-		  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
+   logic [52:0] 	  XManE, YManE, ZManE;                // input's fraction - execute stage
-	
+   logic [52:0] 	  XManM, YManM, ZManM;                // input's fraction - memory stage
-     // FP register file
+   logic [10:0] 	  BiasE;                              // bias based on precision (single=7f double=3ff)
-     fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
+   logic 		  XNaNE, YNaNE, ZNaNE;                // is the input a NaN - execute stage
-			.a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), 
+   logic 		  XNaNM, YNaNM, ZNaNM;                // is the input a NaN - memory stage
-			.a4(RdW), .wd4(FPUResultW),
+   logic 		  XNaNQ, YNaNQ;                       // is the input a NaN - divide
-			.rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
+   logic 		  XSNaNE, YSNaNE, ZSNaNE;             // is the input a signaling NaN - execute stage
   logic 		  XSNaNM, YSNaNM, ZSNaNM;             // is the input a signaling NaN - memory stage
   logic 		  XDenormE, YDenormE, ZDenormE;       // is the input denormalized
   logic 		  XZeroE, YZeroE, ZZeroE;             // is the input zero - execute stage
   logic 		  XZeroM, YZeroM, ZZeroM;             // is the input zero - memory stage
   logic 		  XZeroQ, YZeroQ;                     // is the input zero - divide
   logic 		  XInfE, YInfE, ZInfE;                // is the input infinity - execute stage
   logic 		  XInfM, YInfM, ZInfM;                // is the input infinity - memory stage
   logic 		  XInfQ, YInfQ;                       // is the input infinity - divide
   logic 		  XExpMaxE;                           // is the exponent all ones (max value)
   logic 		  XNormE;                             // is normal
   logic 		  FmtQ;
   logic 		  FOpCtrlQ;     
-     // D/E pipeline registers
+   // result and flag signals
-     flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
+   logic [63:0] 	  FDivResM, FDivResW;                 // divide/squareroot result
-     flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
+   logic [4:0] 	  FDivFlgM;                 // divide/squareroot flags  
-     flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
+   logic [63:0] 	  FMAResM, FMAResW;                   // FMA/multiply result
-     flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+   logic [4:0] 	  FMAFlgM;                   // FMA/multiply result	
-                             {Adr1E, Adr2E, Adr3E});
+   logic [63:0] 	  ReadResW;                           // read result (load instruction)
-     flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+   logic [63:0] 	  CvtFpResE;    // add/FP -> FP convert result
-			       {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
+   logic [4:0] 	  CvtFpFlgE;    // add/FP -> FP convert flags
-			       {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
+   logic [63:0] 	  CvtResE;                   // FP <-> int convert result
   logic [4:0] 	  CvtFlgE;                   // FP <-> int convert flags //*** trim this	
   logic [63:0] 	  ClassResE;               // classify result
   logic [63:0] 	  CmpResE;                   // compare result
   logic 		  CmpNVE;                     // compare invalid flag (Not Valid)     
   logic [63:0] 	  SgnResE;                   // sign injection result
   logic 		  SgnNVE;                     // sign injection invalid flag (Not Valid)     
   logic [63:0] 	  FResE, FResM, FResW;                // selected result that is ready in the memory stage
   logic [4:0] 	  FFlgE, FFlgM;                       // selected flag that is ready in the memory stage     
   logic [`XLEN-1:0] 	  FIntResE;     
   logic [63:0] 	  FPUResultW;                         // final FP result being written to the FP register     
   // other signals
   logic 		  FDivSqrtDoneE;                      // is divide done
   logic [63:0] 	  DivInput1E, DivInput2E;             // inputs to divide/squareroot unit
   logic 		  load_preload;                       // enable for FF on fpdivsqrt     
   logic [63:0] 	  AlignedSrcAE;                       // align SrcA to the floating point format
-     // EXECUTION STAGE
+   // DECODE STAGE
     // Hazard unit for FPU  
     //    - determines if any forwarding or stalls are needed
     fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
                     .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
     // forwarding muxs
     mux3  #(64)  fxemux (FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
     mux3  #(64)  fyemux (FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
     mux3  #(64)  fzemux (FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
     mux3  #(64)  fyaddmux (FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
 			    {2'b0, {10{1'b1}}, 52'b0}, 
 			    {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, 
 			    FSrcYE); // Force Z to be 0 for multiply instructions
     // Force Z to be 0 for multiply instructions     
     mux3  #(64)  fzmulmux (FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
     // unpacking unit
     //    - splits FP inputs into their various parts
     //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity)
     unpacking unpacking (.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
 			  .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
 			  .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
 			  .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
     // FMA
     //   - two stage FMA
     //   - execute stage - multiplication and addend shifting
     //   - memory stage  - addition and rounding
     //   - handles FMA and multiply instructions
     fma fma (.clk, .reset, .FlushM, .StallM, 
 	      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
 	      .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE,
 	      .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
 	      .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
 	      .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
 	      .FOpCtrlE,
 	      .FmtE, .FmtM, .FrmM, 
 	      .FMAFlgM, .FMAResM);
     // fpdivsqrt using Goldschmidt's iteration
     flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
 				.clear(FDivSqrtDoneE), .en(load_preload),
 				.reset(reset),  .clk(clk));
     flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
 			      .clear(FDivSqrtDoneE), .en(load_preload),
 			      .reset(reset),  .clk(clk));
     flopenrc #(8) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE, FmtE, FOpCtrlE[0]}), 
 			     .q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ, FmtQ, FOpCtrlQ}),
 			     .clear(FDivSqrtDoneE), .en(load_preload),
 			     .reset(reset),  .clk(clk));
     fpdiv_pipe fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlQ), 
 			  .reset, .clk(clk), .start(FDivStartE), .P(~FmtQ), .OvEn(1'b1), .UnEn(1'b1),
 			  .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ, .load_preload,
 			  .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
-     // convert from signle to double and vice versa
+   // calculate FP control signals
-     cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
+   fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
-     
+      .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
-     // compare unit
+      .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
     //    - computation is done in one stage
     //    - writes to FP file durring min/max instructions
     //    - other comparisons write a 1 or 0 to the integer register
     fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
 		.FSrcXE, .FSrcYE, .FOpCtrlE, 
 		.FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
 		.Invalid(CmpNVE), .CmpResE);
     // sign injection unit
     fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
 		.SgnNVE, .SgnResE);
     // classify
     fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
 			  .XSNaNE, .ClassResE);
-     // Convert
+   // FP register file
-     fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .ForwardedSrcAE, .FOpCtrlE, .FmtE, .FrmE,
+   fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
-		.CvtResE, .CvtFlgE);
+      .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), 
-     
+      .a4(RdW), .wd4(FPUResultW),
-     // data to be stored in memory - to IEU
+      .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
     //    - FP uses NaN-blocking format
     //        - if there are any unsused bits the most significant bits are filled with 1s
     assign FWriteDataE = FSrcYE[`XLEN-1:0];     
     // Align SrcA to MSB when single precicion
     mux2  #(64)  SrcAMux({{32{1'b1}}, ForwardedSrcAE[31:0]}, {{64-`XLEN{1'b1}}, ForwardedSrcAE}, FmtE, AlignedSrcAE);
     // select a result that may be written to the FP register
     mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
     mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
     // select the result that may be written to the integer register - to IEU
     mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], 
 			       CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
     // E/M pipe registers
-     // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
+   // D/E pipeline registers
-     flopenrc #(65) EMFpReg2 (clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
+   flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
-     flopenrc #(65) EMFpReg3 (clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
+   flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
-     flopenrc #(64) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
+   flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-     flopenrc #(12) EMFpReg5 (clk, reset, FlushM, ~StallM, 
+   flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-			      {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
+                           {Adr1E, Adr2E, Adr3E});
-			      {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
+   flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-     flopenrc #(64) EMRegCmpRes (clk, reset, FlushM, ~StallM, FResE, FResM); 
+               {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
-     flopenrc #(5)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
+               {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
     flopenrc #(`XLEN) EMRegSgnRes (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
     flopenrc #(7) EMCtrlReg (clk, reset, FlushM, ~StallM,
 			       {FRegWriteE, FResultSelE, FrmE, FmtE},
 			       {FRegWriteM, FResultSelM, FrmM, FmtM});
     // BEGIN MEMORY STAGE
     // FPU flag selection - to privileged
     mux4  #(5)  FPUFlgMux (5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelM, SetFflagsM);
     // M/W pipe registers
     flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
     flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
     flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
     flopenrc #(4)  MWCtrlReg(clk, reset, FlushW, ~StallW,
 			      {FRegWriteM, FResultSelM, FmtM},
 			      {FRegWriteW, FResultSelW, FmtW});
     // BEGIN WRITEBACK STAGE
     // put ReadData into NaN-blocking format
     //    - if there are any unsused bits the most significant bits are filled with 1s
     //    - for load instruction
     mux2  #(64)  ReadResMux ({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
     // select the result to be written to the FP register
     mux4  #(64)  FPUResultMux (ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
-  end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low
+   // EXECUTION STAGE
-     assign FStallD = 0;
+   // Hazard unit for FPU  
-     assign FWriteIntE = 0; 
+   //    - determines if any forwarding or stalls are needed
-     assign FWriteDataE = 0;
+   fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
-     assign FIntResM = 0;
+                  .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
-     assign FDivBusyE = 0;
+
-     assign IllegalFPUInstrD = 1;
+   // forwarding muxs
-     assign SetFflagsM = 0;
+   mux3  #(64)  fxemux (FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
-  end
+   mux3  #(64)  fyemux (FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
-  endgenerate 
+   mux3  #(64)  fzemux (FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
-   
+   mux3  #(64)  fyaddmux (FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
            {2'b0, {10{1'b1}}, 52'b0}, 
            {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, 
            FSrcYE); // Force Z to be 0 for multiply instructions
   // Force Z to be 0 for multiply instructions     
   mux3  #(64)  fzmulmux (FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
   // unpacking unit
   //    - splits FP inputs into their various parts
   //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infinity)
   unpacking unpacking (.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
         .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
         .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
         .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
   // FMA
   //   - two stage FMA
   //   - execute stage - multiplication and addend shifting
   //   - memory stage  - addition and rounding
   //   - handles FMA and multiply instructions
   fma fma (.clk, .reset, .FlushM, .StallM, 
      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
      .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE,
      .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
      .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
      .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
      .FOpCtrlE,
      .FmtE, .FmtM, .FrmM, 
      .FMAFlgM, .FMAResM);
   // fpdivsqrt using Goldschmidt's iteration
   flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
         .clear(FDivSqrtDoneE), .en(load_preload),
         .reset(reset),  .clk(clk));
   flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
            .clear(FDivSqrtDoneE), .en(load_preload),
            .reset(reset),  .clk(clk));
   flopenrc #(8) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE, FmtE, FOpCtrlE[0]}), 
            .q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ, FmtQ, FOpCtrlQ}),
            .clear(FDivSqrtDoneE), .en(load_preload),
            .reset(reset),  .clk(clk));
   fpdiv_pipe fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlQ), 
         .reset, .clk(clk), .start(FDivStartE), .P(~FmtQ), .OvEn(1'b1), .UnEn(1'b1),
         .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ, .load_preload,
         .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
   // convert from single to double and vice versa
   cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
   // compare unit
   //    - computation is done in one stage
   //    - writes to FP file during min/max instructions
   //    - other comparisons write a 1 or 0 to the integer register
   fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
   .FSrcXE, .FSrcYE, .FOpCtrlE, 
   .FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
   .Invalid(CmpNVE), .CmpResE);
   // sign injection unit
   fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
   .SgnNVE, .SgnResE);
   // classify
   fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
         .XSNaNE, .ClassResE);
   // Convert
   fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .ForwardedSrcAE, .FOpCtrlE, .FmtE, .FrmE,
   .CvtResE, .CvtFlgE);
   // data to be stored in memory - to IEU
   //    - FP uses NaN-blocking format
   //        - if there are any unused bits the most significant bits are filled with 1s
   assign FWriteDataE = FSrcYE[`XLEN-1:0];     
   // Align SrcA to MSB when single precision
   mux2  #(64)  SrcAMux({{32{1'b1}}, ForwardedSrcAE[31:0]}, {{64-`XLEN{1'b1}}, ForwardedSrcAE}, FmtE, AlignedSrcAE);
   // select a result that may be written to the FP register
   mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
   mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
   // select the result that may be written to the integer register - to IEU
   mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], 
               CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
   // E/M pipe registers
   // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
   flopenrc #(65) EMFpReg2 (clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
   flopenrc #(65) EMFpReg3 (clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
   flopenrc #(64) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
   flopenrc #(12) EMFpReg5 (clk, reset, FlushM, ~StallM, 
            {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
            {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
   flopenrc #(64) EMRegCmpRes (clk, reset, FlushM, ~StallM, FResE, FResM); 
   flopenrc #(5)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
   flopenrc #(`XLEN) EMRegSgnRes (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
   flopenrc #(7) EMCtrlReg (clk, reset, FlushM, ~StallM,
               {FRegWriteE, FResultSelE, FrmE, FmtE},
               {FRegWriteM, FResultSelM, FrmM, FmtM});
   // BEGIN MEMORY STAGE
   // FPU flag selection - to privileged
   mux4  #(5)  FPUFlgMux (5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelM, SetFflagsM);
   // M/W pipe registers
   flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
   flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
   flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
   flopenrc #(4)  MWCtrlReg(clk, reset, FlushW, ~StallW,
            {FRegWriteM, FResultSelM, FmtM},
            {FRegWriteW, FResultSelW, FmtW});
   // BEGIN WRITEBACK STAGE
   // put ReadData into NaN-blocking format
   //    - if there are any unused bits the most significant bits are filled with 1s
   //    - for load instruction
   mux2  #(64)  ReadResMux ({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
   // select the result to be written to the FP register
   mux4  #(64)  FPUResultMux (ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
 endmodule // fpu
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@ -40,65 +40,50 @@ module muldiv (
 	       input logic 		StallM, StallW, FlushM, FlushW 
 	       );
-   generate
+	logic [`XLEN-1:0] MulDivResultM;
-      if (`M_SUPPORTED) begin
+	logic [`XLEN-1:0] PrelimResultM;
-	 logic [`XLEN-1:0] MulDivResultM;
+	logic [`XLEN-1:0] QuotM, RemM;
-	 logic [`XLEN-1:0] PrelimResultM;
+	logic [`XLEN*2-1:0] ProdM; 
 	 logic [`XLEN-1:0] QuotM, RemM;
 	 logic [`XLEN*2-1:0] ProdM; 
-	 logic 		     DivE;
+	logic 		     DivE;
-	 logic 		     DivSignedE;	
+	logic 		     DivSignedE;	
-	 logic           W64M; 
+	logic           W64M; 
 	 // Multiplier
 	 mul mul(
 	 .clk, .reset,
  	 .StallM, .FlushM,
 	    // .SrcAE, .SrcBE,
 	 .ForwardedSrcAE, .ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
 	 .Funct3E,
  	 .ProdM
 	 );
-	 // Divide
+	// Multiplier
-	 // Start a divide when a new division instruction is received and the divider isn't already busy or finishing
+	mul mul(.clk, .reset, .StallM, .FlushM, .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .ProdM);
 	 assign DivE = MulDivE & Funct3E[2];
 	 assign DivSignedE = ~Funct3E[0];
 	 intdivrestoring div(.clk, .reset, .StallM,
 	   .DivSignedE, .W64E, .DivE, .ForwardedSrcAE, .ForwardedSrcBE, .DivBusyE, .QuotM, .RemM);
 	 // Result multiplexer
 	 always_comb
           case (Funct3M)	   
             3'b000: PrelimResultM = ProdM[`XLEN-1:0];
             3'b001: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
             3'b010: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
             3'b011: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
             3'b100: PrelimResultM = QuotM;
             3'b101: PrelimResultM = QuotM;
             3'b110: PrelimResultM = RemM;
             3'b111: PrelimResultM = RemM;
           endcase 
 	 // Handle sign extension for W-type instructions
 	 flopenrc #(1) W64MReg(clk, reset, FlushM, ~StallM, W64E, W64M);
 	 if (`XLEN == 64) begin // RV64 has W-type instructions
            assign MulDivResultM = W64M ? {{32{PrelimResultM[31]}}, PrelimResultM[31:0]} : PrelimResultM;
 	 end else begin // RV32 has no W-type instructions
            assign MulDivResultM = PrelimResultM;
 	 end
-     // Writeback stage pipeline register
+	// Divide
 	// Start a divide when a new division instruction is received and the divider isn't already busy or finishing
 	assign DivE = MulDivE & Funct3E[2];
 	assign DivSignedE = ~Funct3E[0];
 	intdivrestoring div(.clk, .reset, .StallM, .DivSignedE, .W64E, .DivE, 
 	                    .ForwardedSrcAE, .ForwardedSrcBE, .DivBusyE, .QuotM, .RemM);
 	// Result multiplexer
 	always_comb
 		case (Funct3M)	   
 			3'b000: PrelimResultM = ProdM[`XLEN-1:0];
 			3'b001: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
 			3'b010: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
 			3'b011: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
 			3'b100: PrelimResultM = QuotM;
 			3'b101: PrelimResultM = QuotM;
 			3'b110: PrelimResultM = RemM;
 			3'b111: PrelimResultM = RemM;
 		endcase 
-	 flopenrc #(`XLEN) MulDivResultWReg(clk, reset, FlushW, ~StallW, MulDivResultM, MulDivResultW);	 
+	// Handle sign extension for W-type instructions
-
+	flopenrc #(1) W64MReg(clk, reset, FlushM, ~StallM, W64E, W64M);
-      end else begin // no M instructions supported
+	generate
-	 	assign MulDivResultW = 0; 
+		if (`XLEN == 64) begin:resmux // RV64 has W-type instructions
-		assign DivBusyE = 0;
+			assign MulDivResultM = W64M ? {{32{PrelimResultM[31]}}, PrelimResultM[31:0]} : PrelimResultM;
-      end
+		end else begin:resmux // RV32 has no W-type instructions
-   endgenerate
+			assign MulDivResultM = PrelimResultM;
 		end
 	endgenerate
 	// Writeback stage pipeline register
 	flopenrc #(`XLEN) MulDivResultWReg(clk, reset, FlushW, ~StallW, MulDivResultM, MulDivResultW);	 
 endmodule // muldiv
--- a/wally-pipelined/src/privileged/csr.sv
+++ b/wally-pipelined/src/privileged/csr.sv
@ -86,76 +86,48 @@ module csr #(parameter
  logic        IllegalCSRCAccessM, IllegalCSRMAccessM, IllegalCSRSAccessM, IllegalCSRUAccessM, IllegalCSRNAccessM, InsufficientCSRPrivilegeM;
  logic IllegalCSRMWriteReadonlyM;
-  generate
+  // modify CSRs
-    if (`ZICSR_SUPPORTED) begin
+  always_comb begin
-      // modify CSRs
+    // Choose either rs1 or uimm[4:0] as source
-      always_comb begin
+    CSRSrcM = InstrM[14] ? {{(`XLEN-5){1'b0}}, InstrM[19:15]} : SrcAM;
-        // Choose either rs1 or uimm[4:0] as source
+    // Compute AND/OR modification
-        CSRSrcM = InstrM[14] ? {{(`XLEN-5){1'b0}}, InstrM[19:15]} : SrcAM;
+    CSRRWM = CSRSrcM;
-        // Compute AND/OR modification
+    CSRRSM = CSRReadValM | CSRSrcM;
-        CSRRWM = CSRSrcM;
+    CSRRCM = CSRReadValM & ~CSRSrcM;
-        CSRRSM = CSRReadValM | CSRSrcM;
+    case (InstrM[13:12])
-        CSRRCM = CSRReadValM & ~CSRSrcM;
+      2'b01:  CSRWriteValM = CSRRWM;
-        case (InstrM[13:12])
+      2'b10:  CSRWriteValM = CSRRSM;
-          2'b01:  CSRWriteValM = CSRRWM;
+      2'b11:  CSRWriteValM = CSRRCM;
-          2'b10:  CSRWriteValM = CSRRSM;
+      default: CSRWriteValM = CSRReadValM;
-          2'b11:  CSRWriteValM = CSRRCM;
+    endcase
-          default: CSRWriteValM = CSRReadValM;
+  end
        endcase
      end
-      // write CSRs
+  // write CSRs
-      assign CSRAdrM = InstrM[31:20];
+  assign CSRAdrM = InstrM[31:20];
-      assign UnalignedNextEPCM = TrapM ? PCM : CSRWriteValM;
+  assign UnalignedNextEPCM = TrapM ? PCM : CSRWriteValM;
-      assign NextEPCM = `C_SUPPORTED ? {UnalignedNextEPCM[`XLEN-1:1], 1'b0} : {UnalignedNextEPCM[`XLEN-1:2], 2'b00}; // 3.1.15 alignment
+  assign NextEPCM = `C_SUPPORTED ? {UnalignedNextEPCM[`XLEN-1:1], 1'b0} : {UnalignedNextEPCM[`XLEN-1:2], 2'b00}; // 3.1.15 alignment
-      assign NextCauseM = TrapM ? CauseM : CSRWriteValM;
+  assign NextCauseM = TrapM ? CauseM : CSRWriteValM;
-      assign NextMtvalM = TrapM ? NextFaultMtvalM : CSRWriteValM;
+  assign NextMtvalM = TrapM ? NextFaultMtvalM : CSRWriteValM;
-      assign CSRMWriteM = CSRWriteM && (PrivilegeModeW == `M_MODE);
+  assign CSRMWriteM = CSRWriteM && (PrivilegeModeW == `M_MODE);
-      assign CSRSWriteM = CSRWriteM && (|PrivilegeModeW);
+  assign CSRSWriteM = CSRWriteM && (|PrivilegeModeW);
-      assign CSRUWriteM = CSRWriteM;  
+  assign CSRUWriteM = CSRWriteM;  
-      csri  csri(.*);
+  csri  csri(.*);
-      csrsr csrsr(.*);
+  csrsr csrsr(.*);
-      csrc  counters(.*);
+  csrc  counters(.*);
-      csrm  csrm(.*); // Machine Mode CSRs
+  csrm  csrm(.*); // Machine Mode CSRs
-      csrs  csrs(.*);
+  csrs  csrs(.*);
-      csrn  csrn(.CSRNWriteM(CSRUWriteM), .*);  // User Mode Exception Registers
+  csrn  csrn(.CSRNWriteM(CSRUWriteM), .*);  // User Mode Exception Registers
-      csru  csru(.*); // Floating Point Flags are part of User MOde
+  csru  csru(.*); // Floating Point Flags are part of User Mode
-      // merge CSR Reads
+  // merge CSR Reads
-      assign CSRReadValM = CSRUReadValM | CSRSReadValM | CSRMReadValM | CSRCReadValM | CSRNReadValM; 
+  assign CSRReadValM = CSRUReadValM | CSRSReadValM | CSRMReadValM | CSRCReadValM | CSRNReadValM; 
-      // *** add W stall 2/22/21 dh to try fixing memory stalls
+  flopenrc #(`XLEN) CSRValWReg(clk, reset, FlushW, ~StallW, CSRReadValM, CSRReadValW);
 //      floprc #(`XLEN) CSRValWReg(clk, reset, FlushW, CSRReadValM, CSRReadValW);
      flopenrc #(`XLEN) CSRValWReg(clk, reset, FlushW, ~StallW, CSRReadValM, CSRReadValW);
-      // merge illegal accesses: illegal if none of the CSR addresses is legal or privilege is insufficient
+  // merge illegal accesses: illegal if none of the CSR addresses is legal or privilege is insufficient
-      assign InsufficientCSRPrivilegeM = (CSRAdrM[9:8] == 2'b11 && PrivilegeModeW != `M_MODE) ||
+  assign InsufficientCSRPrivilegeM = (CSRAdrM[9:8] == 2'b11 && PrivilegeModeW != `M_MODE) ||
-                                        (CSRAdrM[9:8] == 2'b01 && PrivilegeModeW == `U_MODE);
+                                    (CSRAdrM[9:8] == 2'b01 && PrivilegeModeW == `U_MODE);
-      assign IllegalCSRAccessM = ((IllegalCSRCAccessM && IllegalCSRMAccessM && 
+  assign IllegalCSRAccessM = ((IllegalCSRCAccessM && IllegalCSRMAccessM && 
-        IllegalCSRSAccessM && IllegalCSRUAccessM  && IllegalCSRNAccessM ||
+    IllegalCSRSAccessM && IllegalCSRUAccessM  && IllegalCSRNAccessM ||
-        InsufficientCSRPrivilegeM) && CSRReadM) || IllegalCSRMWriteReadonlyM;
+    InsufficientCSRPrivilegeM) && CSRReadM) || IllegalCSRMWriteReadonlyM;
    end else begin // CSRs not implemented
      assign STATUS_MPP = 2'b11;
      assign STATUS_SPP = 2'b0;
      assign STATUS_TSR = 0;
      assign MEPC_REGW = 0;
      assign SEPC_REGW = 0;
      assign UEPC_REGW = 0;
      assign UTVEC_REGW = 0;
      assign STVEC_REGW = 0;
      assign MTVEC_REGW = 0;
      assign MEDELEG_REGW = 0;
      assign MIDELEG_REGW = 0;
      assign SEDELEG_REGW = 0;
      assign SIDELEG_REGW = 0;
      assign SATP_REGW = 0;
      assign MIP_REGW = 0;
      assign MIE_REGW = 0;
      assign STATUS_MIE = 0;
      assign STATUS_SIE = 0;
      assign FRM_REGW = 0;
      assign CSRReadValM = 0;
      assign IllegalCSRAccessM = CSRReadM;
    end
  endgenerate
 endmodule
--- a/wally-pipelined/src/privileged/privileged.sv
+++ b/wally-pipelined/src/privileged/privileged.sv
@ -239,8 +239,6 @@ module privileged (
            .ExceptionM,
            .PendingInterruptM,
            .PrivilegedNextPCM, .CauseM, .NextFaultMtvalM);
 endmodule
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -198,7 +198,6 @@ module wallypipelinedhart (
 	  ); // instruction fetch unit: PC, branch prediction, instruction cache
  ieu ieu(
     .clk, .reset,
@ -276,7 +275,7 @@ module wallypipelinedhart (
 	.LSUStall);                     // change to LSUStall
-  
+   // *** Ross: please make EBU conditional when only supporting internal memories
  ahblite ebu(// IFU connections
     .clk, .reset,
@ -295,22 +294,7 @@ module wallypipelinedhart (
     .HWRITED);
-  muldiv mdu(
+   hazard     hzu(
     .clk, .reset,
 	// Execute Stage interface
 	//   .SrcAE, .SrcBE,
 	.ForwardedSrcAE, .ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
 	.Funct3E, .Funct3M,
     .MulDivE, .W64E,
 	// Writeback stage
     .MulDivResultW,
     // Divide Done
 	.DivBusyE, 
 	// hazards
 	.StallM, .StallW, .FlushM, .FlushW 
  ); // multiply and divide unit
  hazard     hzu(
     .BPPredWrongE, .CSRWritePendingDEM, .RetM, .TrapM,
     .LoadStallD, .StoreStallD, .MulDivStallD, .CSRRdStallD,
     .LSUStall, .ICacheStallF,
@ -323,57 +307,89 @@ module wallypipelinedhart (
 	.FlushF, .FlushD, .FlushE, .FlushM, .FlushW
     );	// global stall and flush control
-  // Priveleged block operates in M and W stages, handling CSRs and exceptions
+   generate
-  privileged priv(
+      if (`ZICSR_SUPPORTED) begin:priv
-     .clk, .reset,
+         privileged priv(
-     .FlushD, .FlushE, .FlushM, .FlushW, 
+            .clk, .reset,
-     .StallD, .StallE, .StallM, .StallW,
+            .FlushD, .FlushE, .FlushM, .FlushW, 
-     .CSRReadM, .CSRWriteM, .SrcAM, .PCM,
+            .StallD, .StallE, .StallM, .StallW,
-     .InstrM, .CSRReadValW, .PrivilegedNextPCM,
+            .CSRReadM, .CSRWriteM, .SrcAM, .PCM,
-     .RetM, .TrapM, 
+            .InstrM, .CSRReadValW, .PrivilegedNextPCM,
-     .ITLBFlushF, .DTLBFlushM,
+            .RetM, .TrapM, 
-     .InstrValidM, .CommittedM,
+            .ITLBFlushF, .DTLBFlushM,
-     .FRegWriteM, .LoadStallD,
+            .InstrValidM, .CommittedM,
-     .BPPredDirWrongM, .BTBPredPCWrongM,
+            .FRegWriteM, .LoadStallD,
-     .RASPredPCWrongM, .BPPredClassNonCFIWrongM,
+            .BPPredDirWrongM, .BTBPredPCWrongM,
-     .InstrClassM, .DCacheMiss, .DCacheAccess, .PrivilegedM,
+            .RASPredPCWrongM, .BPPredClassNonCFIWrongM,
-     .ITLBInstrPageFaultF, .DTLBLoadPageFaultM, .DTLBStorePageFaultM,
+            .InstrClassM, .DCacheMiss, .DCacheAccess, .PrivilegedM,
-     .WalkerInstrPageFaultF, .WalkerLoadPageFaultM, .WalkerStorePageFaultM,
+            .ITLBInstrPageFaultF, .DTLBLoadPageFaultM, .DTLBStorePageFaultM,
-     .InstrMisalignedFaultM, .IllegalIEUInstrFaultD, .IllegalFPUInstrD,
+            .WalkerInstrPageFaultF, .WalkerLoadPageFaultM, .WalkerStorePageFaultM,
-     .LoadMisalignedFaultM, .StoreMisalignedFaultM,
+            .InstrMisalignedFaultM, .IllegalIEUInstrFaultD, .IllegalFPUInstrD,
-     .TimerIntM, .ExtIntM, .SwIntM,
+            .LoadMisalignedFaultM, .StoreMisalignedFaultM,
-     .MTIME_CLINT, .MTIMECMP_CLINT,
+            .TimerIntM, .ExtIntM, .SwIntM,
-     .InstrMisalignedAdrM, .MemAdrM,
+            .MTIME_CLINT, .MTIMECMP_CLINT,
-     .SetFflagsM,
+            .InstrMisalignedAdrM, .MemAdrM,
-     // Trap signals from pmp/pma in mmu
+            .SetFflagsM,
-     // *** do these need to be split up into one for dmem and one for ifu?
+            // Trap signals from pmp/pma in mmu
-     // instead, could we only care about the instr and F pins that come from ifu and only care about the load/store and m pins that come from dmem?
+            // *** do these need to be split up into one for dmem and one for ifu?
-     .InstrAccessFaultF, .LoadAccessFaultM, .StoreAccessFaultM,
+            // instead, could we only care about the instr and F pins that come from ifu and only care about the load/store and m pins that come from dmem?
-     .ExceptionM, .PendingInterruptM, .IllegalFPUInstrE,
+            .InstrAccessFaultF, .LoadAccessFaultM, .StoreAccessFaultM,
-     .PrivilegeModeW, .SATP_REGW,
+            .ExceptionM, .PendingInterruptM, .IllegalFPUInstrE,
-     .STATUS_MXR, .STATUS_SUM, .STATUS_MPRV, .STATUS_MPP,
+            .PrivilegeModeW, .SATP_REGW,
-     .PMPCFG_ARRAY_REGW, .PMPADDR_ARRAY_REGW, 
+            .STATUS_MXR, .STATUS_SUM, .STATUS_MPRV, .STATUS_MPP,
-     .FRM_REGW,.BreakpointFaultM, .EcallFaultM
+            .PMPCFG_ARRAY_REGW, .PMPADDR_ARRAY_REGW, 
-  );
+            .FRM_REGW,.BreakpointFaultM, .EcallFaultM
-  
+         );
      end else begin
         assign CSRReadValW = 0;
         assign PrivilegedNextPCM = 0;
         assign RetM = 0;
         assign TrapM = 0;
         assign ITLBFlushF = 0;
         assign DTLBFlushM = 0;
      end
      if (`M_SUPPORTED) begin:mdu
         muldiv mdu(
            .clk, .reset,
            .ForwardedSrcAE, .ForwardedSrcBE, 
            .Funct3E, .Funct3M, .MulDivE, .W64E,
            .MulDivResultW, .DivBusyE, 
            .StallM, .StallW, .FlushM, .FlushW 
         ); 
      end else begin // no M instructions supported
         assign MulDivResultW = 0; 
         assign DivBusyE = 0;
      end
-  fpu fpu(
+      if (`F_SUPPORTED) begin:fpu
-     .clk, .reset,
+         fpu fpu(
-     .FRM_REGW, // Rounding mode from CSR
+            .clk, .reset,
-     .InstrD, // instruction from IFU
+            .FRM_REGW, // Rounding mode from CSR
-     .ReadDataW,// Read data from memory
+            .InstrD, // instruction from IFU
-     .ForwardedSrcAE, // Integer input being processed (from IEU)
+            .ReadDataW,// Read data from memory
-     .StallE, .StallM, .StallW, // stall signals from HZU
+            .ForwardedSrcAE, // Integer input being processed (from IEU)
-     .FlushE, .FlushM, .FlushW, // flush signals from HZU
+            .StallE, .StallM, .StallW, // stall signals from HZU
-     .RdM, .RdW, // which FP register to write to (from IEU)
+            .FlushE, .FlushM, .FlushW, // flush signals from HZU
-     .FRegWriteM, // FP register write enable
+            .RdM, .RdW, // which FP register to write to (from IEU)
-     .FStallD, // Stall the decode stage
+            .FRegWriteM, // FP register write enable
-     .FWriteIntE, // integer register write enable
+            .FStallD, // Stall the decode stage
-     .FWriteDataE, // Data to be written to memory
+            .FWriteIntE, // integer register write enable
-     .FIntResM, // data to be written to integer register
+            .FWriteDataE, // Data to be written to memory
-     .FDivBusyE, // Is the divide/sqrt unit busy (stall execute stage)
+            .FIntResM, // data to be written to integer register
-     .IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+            .FDivBusyE, // Is the divide/sqrt unit busy (stall execute stage)
-     .SetFflagsM        // FPU flags (to privileged unit)
+            .IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
-  ); // floating point unit
+            .SetFflagsM        // FPU flags (to privileged unit)
-  
+         ); // floating point unit
      end else begin // F not supported (D requires F): tie off FPU outputs; IllegalFPUInstrD is tied high
         assign FStallD = 0;
         assign FWriteIntE = 0; 
         assign FWriteDataE = 0;
         assign FIntResM = 0;
         assign FDivBusyE = 0;
         assign IllegalFPUInstrD = 1;
         assign SetFflagsM = 0;
      end
   endgenerate
  // Privileged block operates in M and W stages, handling CSRs and exceptions
 endmodule
--- a/wally-pipelined/testbench/sdc/ram2sdLoad.py
+++ b/wally-pipelined/testbench/sdc/ram2sdLoad.py
--- a/wally-pipelined/testbench/sdc/ramdisk2.hex
+++ b/wally-pipelined/testbench/sdc/ramdisk2.hex
--- a/wally-pipelined/testbench/sdc/run_tb.do
+++ b/wally-pipelined/testbench/sdc/run_tb.do
--- a/wally-pipelined/testbench/sdc/sd_top_tb.sv
+++ b/wally-pipelined/testbench/sdc/sd_top_tb.sv
--- a/wally-pipelined/testbench/sdc/wave.do
+++ b/wally-pipelined/testbench/sdc/wave.do
--- a/wally-pipelined/testbench/testbench-linux.sv
+++ b/wally-pipelined/testbench/testbench-linux.sv
@ -174,7 +174,7 @@ module testbench();
  // Useful Aliases
  `define RF          dut.hart.ieu.dp.regf.rf
  `define PC          dut.hart.ifu.pcreg.q
-  `define CSR_BASE    dut.hart.priv.csr.genblk1
+  `define CSR_BASE    dut.hart.priv.priv.csr
  `define HPMCOUNTER  `CSR_BASE.counters.genblk1.HPMCOUNTER_REGW
  `define PMP_BASE    `CSR_BASE.csrm.genblk4
  `define PMPCFG      genblk2.PMPCFGreg.q
@ -210,8 +210,8 @@ module testbench();
  `define STATUS_MIE  `CSR_BASE.csrsr.STATUS_MIE
  `define STATUS_SIE  `CSR_BASE.csrsr.STATUS_SIE
  `define STATUS_UIE  `CSR_BASE.csrsr.STATUS_UIE
-  `define PRIV        dut.hart.priv.privmodereg.q
+  `define PRIV        dut.hart.priv.priv.privmodereg.q
-  `define INSTRET     dut.hart.priv.csr.genblk1.counters.genblk1.genblk2.INSTRETreg.q
+  `define INSTRET     dut.hart.priv.priv.csr.counters.genblk1.genblk2.INSTRETreg.q
  // Common Macros
  `define checkCSR(CSR) \
    begin \
@ -308,9 +308,9 @@ module testbench();
  integer ramFile;
  integer readResult;
  initial begin
-    force dut.hart.priv.SwIntM = 0;
+    force dut.hart.priv.priv.SwIntM = 0;
-    force dut.hart.priv.TimerIntM = 0;
+    force dut.hart.priv.priv.TimerIntM = 0;
-    force dut.hart.priv.ExtIntM = 0;    
+    force dut.hart.priv.priv.ExtIntM = 0;    
    $readmemh({`LINUX_TEST_VECTORS,"bootmem.txt"}, dut.uncore.bootrom.bootrom.RAM, 'h1000 >> 3);
    $readmemb(`TWO_BIT_PRELOAD, dut.hart.ifu.bpred.bpred.Predictor.DirPredictor.PHT.mem);
    $readmemb(`BTB_PRELOAD, dut.hart.ifu.bpred.bpred.TargetPredictor.memory.mem);
@ -365,7 +365,7 @@ module testbench();
  // on the next falling edge the expected state is compared to the wally state.
  // step 0: read the expected state
-  assign checkInstrM = dut.hart.ieu.InstrValidM & ~dut.hart.priv.trap.InstrPageFaultM & ~dut.hart.priv.trap.InterruptM & ~dut.hart.StallM;
+  assign checkInstrM = dut.hart.ieu.InstrValidM & ~dut.hart.priv.priv.trap.InstrPageFaultM & ~dut.hart.priv.priv.trap.InterruptM & ~dut.hart.StallM;
  `define SCAN_NEW_INSTR_FROM_TRACE(STAGE) \
    // always check PC, instruction bits \
    if (checkInstrM) begin \
@ -479,7 +479,7 @@ module testbench();
      end else begin // update MIP immediately
        $display("%tns: Updating MIP to %x",$time,NextMIPexpected);
        MIPexpected = NextMIPexpected;
-        force dut.hart.priv.csr.genblk1.csri.MIP_REGW = MIPexpected;
+        force dut.hart.priv.priv.csr.csri.MIP_REGW = MIPexpected;
      end
      // $display("%tn: ExpectedCSRArrayM = %p",$time,ExpectedCSRArrayM);
      // $display("%tn: ExpectedCSRArrayValueM = %p",$time,ExpectedCSRArrayValueM);
@ -491,11 +491,11 @@ module testbench();
      // $display("%tn: ExpectedCSRArrayValueM[NumCSRM] %x",$time,ExpectedCSRArrayValueM[NumCSRM]);
    end
    if(RequestDelayedMIP & checkInstrM) begin
-      $display("%tns: Executing Delayed MIP. Current MEPC value is %x",$time,dut.hart.priv.csr.genblk1.csrm.MEPC_REGW);
+      $display("%tns: Executing Delayed MIP. Current MEPC value is %x",$time,dut.hart.priv.priv.csr.csrm.MEPC_REGW);
      $display("%tns: Updating MIP to %x",$time,NextMIPexpected);
      MIPexpected = NextMIPexpected;
-      force dut.hart.priv.csr.genblk1.csri.MIP_REGW = MIPexpected;
+      force dut.hart.priv.priv.csr.csri.MIP_REGW = MIPexpected;
-      $display("%tns: Finished Executing Delayed MIP. Current MEPC value is %x",$time,dut.hart.priv.csr.genblk1.csrm.MEPC_REGW);
+      $display("%tns: Finished Executing Delayed MIP. Current MEPC value is %x",$time,dut.hart.priv.priv.csr.csrm.MEPC_REGW);
      RequestDelayedMIP = 0;
    end
  end
@ -576,7 +576,7 @@ module testbench();
        `checkEQ("PCW",PCW,ExpectedPCW)
        //`checkEQ("InstrW",InstrW,ExpectedInstrW) <-- not viable because of
        // compressed to uncompressed conversion
-        `checkEQ("Instr Count",dut.hart.priv.csr.genblk1.counters.genblk1.INSTRET_REGW,InstrCountW)
+        `checkEQ("Instr Count",dut.hart.priv.priv.csr.counters.genblk1.INSTRET_REGW,InstrCountW)
        #2; // delay 2 ns.
        if(`DEBUG_TRACE >= 5) begin
          $display("%tns, %d instrs: Reg Write Address %02d ? expected value: %02d", $time, InstrCountW, dut.hart.ieu.dp.regf.a3, ExpectedRegAdrW);
@ -601,19 +601,19 @@ module testbench();
        // check csr
        for(NumCSRPostWIndex = 0; NumCSRPostWIndex < NumCSRW; NumCSRPostWIndex++) begin
          case(ExpectedCSRArrayW[NumCSRPostWIndex])
-            "mhartid": `checkCSR(dut.hart.priv.csr.genblk1.csrm.MHARTID_REGW)
+            "mhartid": `checkCSR(dut.hart.priv.priv.csr.csrm.MHARTID_REGW)
-            "mstatus": `checkCSR(dut.hart.priv.csr.genblk1.csrm.MSTATUS_REGW)
+            "mstatus": `checkCSR(dut.hart.priv.priv.csr.csrm.MSTATUS_REGW)
-            "mtvec":   `checkCSR(dut.hart.priv.csr.genblk1.csrm.MTVEC_REGW)
+            "mtvec":   `checkCSR(dut.hart.priv.priv.csr.csrm.MTVEC_REGW)
-            "mip":     `checkCSR(dut.hart.priv.csr.genblk1.csrm.MIP_REGW)
+            "mip":     `checkCSR(dut.hart.priv.priv.csr.csrm.MIP_REGW)
-            "mie":     `checkCSR(dut.hart.priv.csr.genblk1.csrm.MIE_REGW)
+            "mie":     `checkCSR(dut.hart.priv.priv.csr.csrm.MIE_REGW)
-            "mideleg": `checkCSR(dut.hart.priv.csr.genblk1.csrm.MIDELEG_REGW)
+            "mideleg": `checkCSR(dut.hart.priv.priv.csr.csrm.MIDELEG_REGW)
-            "medeleg": `checkCSR(dut.hart.priv.csr.genblk1.csrm.MEDELEG_REGW)
+            "medeleg": `checkCSR(dut.hart.priv.priv.csr.csrm.MEDELEG_REGW)
-            "mepc":    `checkCSR(dut.hart.priv.csr.genblk1.csrm.MEPC_REGW)
+            "mepc":    `checkCSR(dut.hart.priv.priv.csr.csrm.MEPC_REGW)
-            "mtval":   `checkCSR(dut.hart.priv.csr.genblk1.csrm.MTVAL_REGW)
+            "mtval":   `checkCSR(dut.hart.priv.priv.csr.csrm.MTVAL_REGW)
-            "sepc":    `checkCSR(dut.hart.priv.csr.genblk1.csrs.SEPC_REGW)
+            "sepc":    `checkCSR(dut.hart.priv.priv.csr.csrs.SEPC_REGW)
-            "scause":  `checkCSR(dut.hart.priv.csr.genblk1.csrs.genblk1.SCAUSE_REGW)
+            "scause":  `checkCSR(dut.hart.priv.priv.csr.csrs.genblk1.SCAUSE_REGW)
-            "stvec":   `checkCSR(dut.hart.priv.csr.genblk1.csrs.STVEC_REGW)
+            "stvec":   `checkCSR(dut.hart.priv.priv.csr.csrs.STVEC_REGW)
-            "stval":   `checkCSR(dut.hart.priv.csr.genblk1.csrs.genblk1.STVAL_REGW)
+            "stval":   `checkCSR(dut.hart.priv.priv.csr.csrs.genblk1.STVAL_REGW)
          endcase
        end
        if (fault == 1) begin
@ -667,7 +667,7 @@ module testbench();
    begin
      int i;
      // Grab the SATP register from privileged unit
-      SATP = dut.hart.priv.csr.SATP_REGW;
+      SATP = dut.hart.priv.priv.csr.SATP_REGW;
      // Split the virtual address into page number segments and offset
      VPN[2] = adrIn[38:30];
      VPN[1] = adrIn[29:21];
@ -677,7 +677,7 @@ module testbench();
      SvMode = SATP[63];
      // Only perform translation if translation is on and the processor is not
      // in machine mode
-      if (SvMode && (dut.hart.priv.PrivilegeModeW != `M_MODE)) begin
+      if (SvMode && (dut.hart.priv.priv.PrivilegeModeW != `M_MODE)) begin
        BaseAdr = SATP[43:0] << 12;
        for (i = 2; i >= 0; i--) begin
          PAdr = BaseAdr + (VPN[i] << 3);
--- a/wally-pipelined/testbench/testbench.sv
+++ b/wally-pipelined/testbench/testbench.sv
@ -287,7 +287,7 @@ logic [3:0] dummy;
  // Termination condition
  // terminate on a specific ECALL for Imperas tests, or on a jump to self infinite loop for RISC-V Arch tests
-  assign DCacheFlushStart = dut.hart.priv.EcallFaultM && 
+  assign DCacheFlushStart = dut.hart.priv.priv.EcallFaultM && 
 			    (dut.hart.ieu.dp.regf.rf[3] == 1 || 
 			     (dut.hart.ieu.dp.regf.we3 && 
 			      dut.hart.ieu.dp.regf.a3 == 3 && 
@ -318,7 +318,7 @@ module riscvassertions;
  initial begin
    assert (`PMP_ENTRIES == 0 || `PMP_ENTRIES==16 || `PMP_ENTRIES==64) else $error("Illegal number of PMP entries: PMP_ENTRIES must be 0, 16, or 64");
    assert (`DIV_BITSPERCYCLE == 1 || `DIV_BITSPERCYCLE==2 || `DIV_BITSPERCYCLE==4) else $error("Illegal number of divider bits/cycle: DIV_BITSPERCYCLE must be 1, 2, or 4");
-    assert (`F_SUPPORTED || ~`D_SUPPORTED) else $error("Can't support double without supporting float");
+    assert (`F_SUPPORTED || ~`D_SUPPORTED) else $error("Can't support double (D) without supporting float (F)");
    assert (`XLEN == 64 || ~`D_SUPPORTED) else $error("Wally does not yet support D extensions on RV32");
    assert (`DCACHE_WAYSIZEINBYTES <= 4096 || `MEM_DCACHE == 0 || `MEM_VIRTMEM == 0) else $error("DCACHE_WAYSIZEINBYTES cannot exceed 4 KiB when caches and vitual memory is enabled (to prevent aliasing)");
    assert (`DCACHE_BLOCKLENINBITS >= 128 || `MEM_DCACHE == 0) else $error("DCACHE_BLOCKLENINBITS must be at least 128 when caches are enabled");