Moved generate statements for optional units into wallypipelinedhart

2025-02-11 06:05:49 +00:00 · 2021-12-19 16:53:41 -08:00 · 2021-12-19 16:53:41 -08:00 · 1196e5c191
commit 1196e5c191
parent 5e1c3e322b
12 changed files with 424 additions and 467 deletions
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -44,258 +44,244 @@ module fpu (
  output logic [4:0] 	   SetFflagsM        // FPU flags (to privileged unit)
  );

-  //*** make everything FLEN at some point
-  //*** add the 128 bit support to the if statement when needed
-  //*** make new tests for fp using testfloat that include flag checking and all rounding modes
-  //*** what is the format for 16-bit - finding conflicting info online can't find anything specified in spec
-  //*** only fma/mul and fp <-> int convert flags have been tested. test the others.
+   //*** make everything FLEN at some point
+   //*** add the 128 bit support to the if statement when needed
+   //*** make new tests for fp using testfloat that include flag checking and all rounding modes
+   //*** what is the format for 16-bit - finding conflicting info online can't find anything specified in spec
+   //*** only fma/mul and fp <-> int convert flags have been tested. test the others.

-  // FPU specifics:
-  //    - uses NaN-blocking format
-  //        - if there are any unsused bits the most significant bits are filled with 1s
-  //                single stored in a double: | 32 1s | single precision value |
-  //    - sets the underflow after rounding
+   // FPU specifics:
+   //    - uses NaN-blocking format
+   //        - if there are any unused bits the most significant bits are filled with 1s
+   //                single stored in a double: | 32 1s | single precision value |
+   //    - sets the underflow after rounding
  
-  generate if (`F_SUPPORTED | `D_SUPPORTED) begin : fpu
+   // control signals
+   logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
+   logic [2:0] 	  FrmD, FrmE, FrmM;                   // FP rounding mode
+   logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
+   logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
+   logic 		  FWriteIntD;                         // Write to integer register
+   logic [1:0] 	  FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
+   logic [1:0] 	  FResultSelD, FResultSelE;           // Select the result written to FP register
+   logic [1:0] 	  FResultSelM, FResultSelW;           // Select the result written to FP register
+   logic [2:0] 	  FOpCtrlD, FOpCtrlE;       // Select which operation to do in each component
+   logic [2:0] 	  FResSelD, FResSelE;       // Select one of the results that finish in the memory stage
+   logic [1:0] 	  FIntResSelD, FIntResSelE;           // Select the result written to the integer register
+   logic [4:0] 	  Adr1E, Adr2E, Adr3E;                // addresses of each input

-     // control signals
-     logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
-     logic [2:0] 	  FrmD, FrmE, FrmM;                   // FP rounding mode
-     logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
-     logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
-     logic 		  FWriteIntD;                         // Write to integer register
-     logic [1:0] 	  FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
-     logic [1:0] 	  FResultSelD, FResultSelE;           // Select the result written to FP register
-     logic [1:0] 	  FResultSelM, FResultSelW;           // Select the result written to FP register
-     logic [2:0] 	  FOpCtrlD, FOpCtrlE;       // Select which opperation to do in each component
-     logic [2:0] 	  FResSelD, FResSelE;       // Select one of the results that finish in the memory stage
-     logic [1:0] 	  FIntResSelD, FIntResSelE;           // Select the result written to the integer resister
-     logic [4:0] 	  Adr1E, Adr2E, Adr3E;                // adresses of each input
-     
-     // regfile signals
-     logic [63:0] 	  FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
-     logic [63:0] 	  FRD1E, FRD2E, FRD3E;                // Read Data from FP register - execute stage
-     logic [63:0] 	  FSrcXE;                             // Input 1 to the various units (after forwarding)
-     logic [63:0] 	  FPreSrcYE, FSrcYE;                  // Input 2 to the various units (after forwarding)
-     logic [63:0] 	  FPreSrcZE, FSrcZE;                  // Input 3 to the various units (after forwarding)
-     
-     // unpacking signals
-     logic 		  XSgnE, YSgnE, ZSgnE;                // input's sign - execute stage
-     logic 		  XSgnM, YSgnM;                       // input's sign - memory stage
-     logic [10:0] 	  XExpE, YExpE, ZExpE;                // input's exponent - execute stage
-     logic [10:0] 	  XExpM, YExpM, ZExpM;                // input's exponent - memory stage
-     logic [52:0] 	  XManE, YManE, ZManE;                // input's fraction - execute stage
-     logic [52:0] 	  XManM, YManM, ZManM;                // input's fraction - memory stage
-     logic [10:0] 	  BiasE;                              // bias based on precision (single=7f double=3ff)
-     logic 		  XNaNE, YNaNE, ZNaNE;                // is the input a NaN - execute stage
-     logic 		  XNaNM, YNaNM, ZNaNM;                // is the input a NaN - memory stage
-     logic 		  XNaNQ, YNaNQ;                       // is the input a NaN - divide
-     logic 		  XSNaNE, YSNaNE, ZSNaNE;             // is the input a signaling NaN - execute stage
-     logic 		  XSNaNM, YSNaNM, ZSNaNM;             // is the input a signaling NaN - memory stage
-     logic 		  XDenormE, YDenormE, ZDenormE;       // is the input denormalized
-     logic 		  XZeroE, YZeroE, ZZeroE;             // is the input zero - execute stage
-     logic 		  XZeroM, YZeroM, ZZeroM;             // is the input zero - memory stage
-     logic 		  XZeroQ, YZeroQ;                     // is the input zero - divide
-     logic 		  XInfE, YInfE, ZInfE;                // is the input infinity - execute stage
-     logic 		  XInfM, YInfM, ZInfM;                // is the input infinity - memory stage
-     logic 		  XInfQ, YInfQ;                       // is the input infinity - divide
-     logic 		  XExpMaxE;                           // is the exponent all ones (max value)
-     logic 		  XNormE;                             // is normal
-     logic 		  FmtQ;
-     logic 		  FOpCtrlQ;     
-     
-     // result and flag signals
-     logic [63:0] 	  FDivResM, FDivResW;                 // divide/squareroot result
-     logic [4:0] 	  FDivFlgM;                 // divide/squareroot flags  
-     logic [63:0] 	  FMAResM, FMAResW;                   // FMA/multiply result
-     logic [4:0] 	  FMAFlgM;                   // FMA/multiply result	
-     logic [63:0] 	  ReadResW;                           // read result (load instruction)
-     logic [63:0] 	  CvtFpResE;    // add/FP -> FP convert result
-     logic [4:0] 	  CvtFpFlgE;    // add/FP -> FP convert flags
-     logic [63:0] 	  CvtResE;                   // FP <-> int convert result
-     logic [4:0] 	  CvtFlgE;                   // FP <-> int convert flags //*** trim this	
-     logic [63:0] 	  ClassResE;               // classify result
-     logic [63:0] 	  CmpResE;                   // compare result
-     logic 		  CmpNVE;                     // compare invalid flag (Not Valid)     
-     logic [63:0] 	  SgnResE;                   // sign injection result
-     logic 		  SgnNVE;                     // sign injection invalid flag (Not Valid)     
-     logic [63:0] 	  FResE, FResM, FResW;                // selected result that is ready in the memory stage
-     logic [4:0] 	  FFlgE, FFlgM;                       // selected flag that is ready in the memory stage     
-     logic [`XLEN-1:0] 	  FIntResE;     
-     logic [63:0] 	  FPUResultW;                         // final FP result being written to the FP register     
-     // other signals
-     logic 		  FDivSqrtDoneE;                      // is divide done
-     logic [63:0] 	  DivInput1E, DivInput2E;             // inputs to divide/squareroot unit
-     logic 		  load_preload;                       // enable for FF on fpdivsqrt     
-     logic [63:0] 	  AlignedSrcAE;                       // align SrcA to the floating point format
+   // regfile signals
+   logic [63:0] 	  FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
+   logic [63:0] 	  FRD1E, FRD2E, FRD3E;                // Read Data from FP register - execute stage
+   logic [63:0] 	  FSrcXE;                             // Input 1 to the various units (after forwarding)
+   logic [63:0] 	  FPreSrcYE, FSrcYE;                  // Input 2 to the various units (after forwarding)
+   logic [63:0] 	  FPreSrcZE, FSrcZE;                  // Input 3 to the various units (after forwarding)

-     // DECODE STAGE
-     
-     // calculate FP control signals
-     fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
-		  .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
-		  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
-	
-     // FP register file
-     fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
-			.a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), 
-			.a4(RdW), .wd4(FPUResultW),
-			.rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
+   // unpacking signals
+   logic 		  XSgnE, YSgnE, ZSgnE;                // input's sign - execute stage
+   logic 		  XSgnM, YSgnM;                       // input's sign - memory stage
+   logic [10:0] 	  XExpE, YExpE, ZExpE;                // input's exponent - execute stage
+   logic [10:0] 	  XExpM, YExpM, ZExpM;                // input's exponent - memory stage
+   logic [52:0] 	  XManE, YManE, ZManE;                // input's fraction - execute stage
+   logic [52:0] 	  XManM, YManM, ZManM;                // input's fraction - memory stage
+   logic [10:0] 	  BiasE;                              // bias based on precision (single=7f double=3ff)
+   logic 		  XNaNE, YNaNE, ZNaNE;                // is the input a NaN - execute stage
+   logic 		  XNaNM, YNaNM, ZNaNM;                // is the input a NaN - memory stage
+   logic 		  XNaNQ, YNaNQ;                       // is the input a NaN - divide
+   logic 		  XSNaNE, YSNaNE, ZSNaNE;             // is the input a signaling NaN - execute stage
+   logic 		  XSNaNM, YSNaNM, ZSNaNM;             // is the input a signaling NaN - memory stage
+   logic 		  XDenormE, YDenormE, ZDenormE;       // is the input denormalized
+   logic 		  XZeroE, YZeroE, ZZeroE;             // is the input zero - execute stage
+   logic 		  XZeroM, YZeroM, ZZeroM;             // is the input zero - memory stage
+   logic 		  XZeroQ, YZeroQ;                     // is the input zero - divide
+   logic 		  XInfE, YInfE, ZInfE;                // is the input infinity - execute stage
+   logic 		  XInfM, YInfM, ZInfM;                // is the input infinity - memory stage
+   logic 		  XInfQ, YInfQ;                       // is the input infinity - divide
+   logic 		  XExpMaxE;                           // is the exponent all ones (max value)
+   logic 		  XNormE;                             // is normal
+   logic 		  FmtQ;
+   logic 		  FOpCtrlQ;     

-     // D/E pipeline registers
-     flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
-     flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
-     flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-     flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-                             {Adr1E, Adr2E, Adr3E});
-     flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-			       {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
-			       {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
+   // result and flag signals
+   logic [63:0] 	  FDivResM, FDivResW;                 // divide/squareroot result
+   logic [4:0] 	  FDivFlgM;                 // divide/squareroot flags  
+   logic [63:0] 	  FMAResM, FMAResW;                   // FMA/multiply result
+   logic [4:0] 	  FMAFlgM;                   // FMA/multiply flags
+   logic [63:0] 	  ReadResW;                           // read result (load instruction)
+   logic [63:0] 	  CvtFpResE;    // add/FP -> FP convert result
+   logic [4:0] 	  CvtFpFlgE;    // add/FP -> FP convert flags
+   logic [63:0] 	  CvtResE;                   // FP <-> int convert result
+   logic [4:0] 	  CvtFlgE;                   // FP <-> int convert flags //*** trim this	
+   logic [63:0] 	  ClassResE;               // classify result
+   logic [63:0] 	  CmpResE;                   // compare result
+   logic 		  CmpNVE;                     // compare invalid flag (Not Valid)     
+   logic [63:0] 	  SgnResE;                   // sign injection result
+   logic 		  SgnNVE;                     // sign injection invalid flag (Not Valid)     
+   logic [63:0] 	  FResE, FResM, FResW;                // selected result that is ready in the memory stage
+   logic [4:0] 	  FFlgE, FFlgM;                       // selected flag that is ready in the memory stage     
+   logic [`XLEN-1:0] 	  FIntResE;     
+   logic [63:0] 	  FPUResultW;                         // final FP result being written to the FP register     
+   // other signals
+   logic 		  FDivSqrtDoneE;                      // is divide done
+   logic [63:0] 	  DivInput1E, DivInput2E;             // inputs to divide/squareroot unit
+   logic 		  load_preload;                       // enable for FF on fpdivsqrt     
+   logic [63:0] 	  AlignedSrcAE;                       // align SrcA to the floating point format

-     // EXECUTION STAGE
-     // Hazard unit for FPU  
-     //    - determines if any forwarding or stalls are needed
-     fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
-                     .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
-     
-     // forwarding muxs
-     mux3  #(64)  fxemux (FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
-     mux3  #(64)  fyemux (FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
-     mux3  #(64)  fzemux (FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
-     mux3  #(64)  fyaddmux (FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
-			    {2'b0, {10{1'b1}}, 52'b0}, 
-			    {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, 
-			    FSrcYE); // Force Z to be 0 for multiply instructions
-     // Force Z to be 0 for multiply instructions     
-     mux3  #(64)  fzmulmux (FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
-       
-     // unpacking unit
-     //    - splits FP inputs into their various parts
-     //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity)
-     unpacking unpacking (.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
-			  .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
-			  .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
-			  .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
-     
-     // FMA
-     //   - two stage FMA
-     //   - execute stage - multiplication and addend shifting
-     //   - memory stage  - addition and rounding
-     //   - handles FMA and multiply instructions
-     fma fma (.clk, .reset, .FlushM, .StallM, 
-	      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
-	      .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE,
-	      .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
-	      .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
-	      .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
-	      .FOpCtrlE,
-	      .FmtE, .FmtM, .FrmM, 
-	      .FMAFlgM, .FMAResM);
-     
-     // fpdivsqrt using Goldschmidt's iteration
-     flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
-				.clear(FDivSqrtDoneE), .en(load_preload),
-				.reset(reset),  .clk(clk));
-     flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
-			      .clear(FDivSqrtDoneE), .en(load_preload),
-			      .reset(reset),  .clk(clk));
-     flopenrc #(8) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE, FmtE, FOpCtrlE[0]}), 
-			     .q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ, FmtQ, FOpCtrlQ}),
-			     .clear(FDivSqrtDoneE), .en(load_preload),
-			     .reset(reset),  .clk(clk));
-     fpdiv_pipe fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlQ), 
-			  .reset, .clk(clk), .start(FDivStartE), .P(~FmtQ), .OvEn(1'b1), .UnEn(1'b1),
-			  .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ, .load_preload,
-			  .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
+   // DECODE STAGE

-     // convert from signle to double and vice versa
-     cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
-     
-     // compare unit
-     //    - computation is done in one stage
-     //    - writes to FP file durring min/max instructions
-     //    - other comparisons write a 1 or 0 to the integer register
-     fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
-		.FSrcXE, .FSrcYE, .FOpCtrlE, 
-		.FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
-		.Invalid(CmpNVE), .CmpResE);
-     
-     // sign injection unit
-     fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
-		.SgnNVE, .SgnResE);
-     
-     // classify
-     fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
-			  .XSNaNE, .ClassResE);
+   // calculate FP control signals
+   fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
+      .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+      .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);

-     // Convert
-     fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .ForwardedSrcAE, .FOpCtrlE, .FmtE, .FrmE,
-		.CvtResE, .CvtFlgE);
-     
-     // data to be stored in memory - to IEU
-     //    - FP uses NaN-blocking format
-     //        - if there are any unsused bits the most significant bits are filled with 1s
-     assign FWriteDataE = FSrcYE[`XLEN-1:0];     
-     
-     // Align SrcA to MSB when single precicion
-     mux2  #(64)  SrcAMux({{32{1'b1}}, ForwardedSrcAE[31:0]}, {{64-`XLEN{1'b1}}, ForwardedSrcAE}, FmtE, AlignedSrcAE);
-     
-     // select a result that may be written to the FP register
-     mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
-     mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
-     
-     // select the result that may be written to the integer register - to IEU
-     mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], 
-			       CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
-     
-     // E/M pipe registers
+   // FP register file
+   fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
+      .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), 
+      .a4(RdW), .wd4(FPUResultW),
+      .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	

-     // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
-     flopenrc #(65) EMFpReg2 (clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
-     flopenrc #(65) EMFpReg3 (clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
-     flopenrc #(64) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
-     flopenrc #(12) EMFpReg5 (clk, reset, FlushM, ~StallM, 
-			      {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
-			      {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
-     flopenrc #(64) EMRegCmpRes (clk, reset, FlushM, ~StallM, FResE, FResM); 
-     flopenrc #(5)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
-     flopenrc #(`XLEN) EMRegSgnRes (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
-     flopenrc #(7) EMCtrlReg (clk, reset, FlushM, ~StallM,
-			       {FRegWriteE, FResultSelE, FrmE, FmtE},
-			       {FRegWriteM, FResultSelM, FrmM, FmtM});
-     
-     // BEGIN MEMORY STAGE
-     
-     // FPU flag selection - to privileged
-     mux4  #(5)  FPUFlgMux (5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelM, SetFflagsM);
-  
-     // M/W pipe registers
-     flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-     flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
-     flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
-     flopenrc #(4)  MWCtrlReg(clk, reset, FlushW, ~StallW,
-			      {FRegWriteM, FResultSelM, FmtM},
-			      {FRegWriteW, FResultSelW, FmtW});
-     
-     // BEGIN WRITEBACK STAGE
-     
-     // put ReadData into NaN-blocking format
-     //    - if there are any unsused bits the most significant bits are filled with 1s
-     //    - for load instruction
-     mux2  #(64)  ReadResMux ({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
-     
-     // select the result to be written to the FP register
-     mux4  #(64)  FPUResultMux (ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
+   // D/E pipeline registers
+   flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
+   flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
+   flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
+   flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+                           {Adr1E, Adr2E, Adr3E});
+   flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+               {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
+               {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});

-  end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low
-     assign FStallD = 0;
-     assign FWriteIntE = 0; 
-     assign FWriteDataE = 0;
-     assign FIntResM = 0;
-     assign FDivBusyE = 0;
-     assign IllegalFPUInstrD = 1;
-     assign SetFflagsM = 0;
-  end
-  endgenerate 
-   
+   // EXECUTION STAGE
+   // Hazard unit for FPU  
+   //    - determines if any forwarding or stalls are needed
+   fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
+                  .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
+
+   // forwarding muxs
+   mux3  #(64)  fxemux (FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
+   mux3  #(64)  fyemux (FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
+   mux3  #(64)  fzemux (FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
+   mux3  #(64)  fyaddmux (FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
+            {2'b0, {10{1'b1}}, 52'b0}, 
+            {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, 
+            FSrcYE); // Force Y to be 1.0 (in the selected precision) for add instructions
+   // Force Z to be 0 for multiply instructions     
+   mux3  #(64)  fzmulmux (FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
+      
+   // unpacking unit
+   //    - splits FP inputs into their various parts
+   //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infinity)
+   unpacking unpacking (.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
+         .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+         .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
+         .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
+
+   // FMA
+   //   - two stage FMA
+   //   - execute stage - multiplication and addend shifting
+   //   - memory stage  - addition and rounding
+   //   - handles FMA and multiply instructions
+   fma fma (.clk, .reset, .FlushM, .StallM, 
+      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+      .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE,
+      .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
+      .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
+      .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
+      .FOpCtrlE,
+      .FmtE, .FmtM, .FrmM, 
+      .FMAFlgM, .FMAResM);
+
+   // fpdivsqrt using Goldschmidt's iteration
+   flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
+         .clear(FDivSqrtDoneE), .en(load_preload),
+         .reset(reset),  .clk(clk));
+   flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
+            .clear(FDivSqrtDoneE), .en(load_preload),
+            .reset(reset),  .clk(clk));
+   flopenrc #(8) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE, FmtE, FOpCtrlE[0]}), 
+            .q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ, FmtQ, FOpCtrlQ}),
+            .clear(FDivSqrtDoneE), .en(load_preload),
+            .reset(reset),  .clk(clk));
+   fpdiv_pipe fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlQ), 
+         .reset, .clk(clk), .start(FDivStartE), .P(~FmtQ), .OvEn(1'b1), .UnEn(1'b1),
+         .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ, .load_preload,
+         .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
+
+   // convert from single to double and vice versa
+   cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
+
+   // compare unit
+   //    - computation is done in one stage
+   //    - writes to FP file during min/max instructions
+   //    - other comparisons write a 1 or 0 to the integer register
+   fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
+   .FSrcXE, .FSrcYE, .FOpCtrlE, 
+   .FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
+   .Invalid(CmpNVE), .CmpResE);
+
+   // sign injection unit
+   fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
+   .SgnNVE, .SgnResE);
+
+   // classify
+   fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
+         .XSNaNE, .ClassResE);
+
+   // Convert
+   fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .ForwardedSrcAE, .FOpCtrlE, .FmtE, .FrmE,
+   .CvtResE, .CvtFlgE);
+
+   // data to be stored in memory - to IEU
+   //    - FP uses NaN-blocking format
+   //        - if there are any unused bits the most significant bits are filled with 1s
+   assign FWriteDataE = FSrcYE[`XLEN-1:0];     
+
+   // Align SrcA to MSB when single precision
+   mux2  #(64)  SrcAMux({{32{1'b1}}, ForwardedSrcAE[31:0]}, {{64-`XLEN{1'b1}}, ForwardedSrcAE}, FmtE, AlignedSrcAE);
+
+   // select a result that may be written to the FP register
+   mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
+   mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
+
+   // select the result that may be written to the integer register - to IEU
+   mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], 
+               CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
+
+   // E/M pipe registers
+
+   // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
+   flopenrc #(65) EMFpReg2 (clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
+   flopenrc #(65) EMFpReg3 (clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
+   flopenrc #(64) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
+   flopenrc #(12) EMFpReg5 (clk, reset, FlushM, ~StallM, 
+            {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
+            {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
+   flopenrc #(64) EMRegCmpRes (clk, reset, FlushM, ~StallM, FResE, FResM); 
+   flopenrc #(5)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
+   flopenrc #(`XLEN) EMRegSgnRes (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
+   flopenrc #(7) EMCtrlReg (clk, reset, FlushM, ~StallM,
+               {FRegWriteE, FResultSelE, FrmE, FmtE},
+               {FRegWriteM, FResultSelM, FrmM, FmtM});
+
+   // BEGIN MEMORY STAGE
+
+   // FPU flag selection - to privileged
+   mux4  #(5)  FPUFlgMux (5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelM, SetFflagsM);
+
+   // M/W pipe registers
+   flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
+   flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
+   flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
+   flopenrc #(4)  MWCtrlReg(clk, reset, FlushW, ~StallW,
+            {FRegWriteM, FResultSelM, FmtM},
+            {FRegWriteW, FResultSelW, FmtW});
+
+   // BEGIN WRITEBACK STAGE
+
+   // put ReadData into NaN-blocking format
+   //    - if there are any unused bits the most significant bits are filled with 1s
+   //    - for load instruction
+   mux2  #(64)  ReadResMux ({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
+
+   // select the result to be written to the FP register
+   mux4  #(64)  FPUResultMux (ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
 endmodule // fpu
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@ -40,65 +40,50 @@ module muldiv (
 	       input logic 		StallM, StallW, FlushM, FlushW 
 	       );

-   generate
-      if (`M_SUPPORTED) begin
-	 logic [`XLEN-1:0] MulDivResultM;
-	 logic [`XLEN-1:0] PrelimResultM;
-	 logic [`XLEN-1:0] QuotM, RemM;
-	 logic [`XLEN*2-1:0] ProdM; 
+	logic [`XLEN-1:0] MulDivResultM;
+	logic [`XLEN-1:0] PrelimResultM;
+	logic [`XLEN-1:0] QuotM, RemM;
+	logic [`XLEN*2-1:0] ProdM; 

-	 logic 		     DivE;
-	 logic 		     DivSignedE;	
-	 logic           W64M; 
-	 
-	 // Multiplier
-	 mul mul(
-	 .clk, .reset,
-  	 .StallM, .FlushM,
-	    // .SrcAE, .SrcBE,
-	 .ForwardedSrcAE, .ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
-	 .Funct3E,
-  	 .ProdM
-	 );
+	logic 		     DivE;
+	logic 		     DivSignedE;	
+	logic           W64M; 

-	 // Divide
-	 // Start a divide when a new division instruction is received and the divider isn't already busy or finishing
-	 assign DivE = MulDivE & Funct3E[2];
-	 assign DivSignedE = ~Funct3E[0];
-	 intdivrestoring div(.clk, .reset, .StallM,
-	   .DivSignedE, .W64E, .DivE, .ForwardedSrcAE, .ForwardedSrcBE, .DivBusyE, .QuotM, .RemM);
-	 	 
-	 // Result multiplexer
-	 always_comb
-           case (Funct3M)	   
-             3'b000: PrelimResultM = ProdM[`XLEN-1:0];
-             3'b001: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
-             3'b010: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
-             3'b011: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
-             3'b100: PrelimResultM = QuotM;
-             3'b101: PrelimResultM = QuotM;
-             3'b110: PrelimResultM = RemM;
-             3'b111: PrelimResultM = RemM;
-           endcase 
-	 
-	 // Handle sign extension for W-type instructions
-	 flopenrc #(1) W64MReg(clk, reset, FlushM, ~StallM, W64E, W64M);
-	 if (`XLEN == 64) begin // RV64 has W-type instructions
-            assign MulDivResultM = W64M ? {{32{PrelimResultM[31]}}, PrelimResultM[31:0]} : PrelimResultM;
-	 end else begin // RV32 has no W-type instructions
-            assign MulDivResultM = PrelimResultM;
-	 end
+	// Multiplier
+	mul mul(.clk, .reset, .StallM, .FlushM, .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .ProdM);

-     // Writeback stage pipeline register
+	// Divide
+	// Start a divide when a new division instruction is received and the divider isn't already busy or finishing
+	assign DivE = MulDivE & Funct3E[2];
+	assign DivSignedE = ~Funct3E[0];
+	intdivrestoring div(.clk, .reset, .StallM, .DivSignedE, .W64E, .DivE, 
+	                    .ForwardedSrcAE, .ForwardedSrcBE, .DivBusyE, .QuotM, .RemM);
+		
+	// Result multiplexer
+	always_comb
+		case (Funct3M)	   
+			3'b000: PrelimResultM = ProdM[`XLEN-1:0];
+			3'b001: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
+			3'b010: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
+			3'b011: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
+			3'b100: PrelimResultM = QuotM;
+			3'b101: PrelimResultM = QuotM;
+			3'b110: PrelimResultM = RemM;
+			3'b111: PrelimResultM = RemM;
+		endcase 

-	 flopenrc #(`XLEN) MulDivResultWReg(clk, reset, FlushW, ~StallW, MulDivResultM, MulDivResultW);	 
-
-      end else begin // no M instructions supported
-	 	assign MulDivResultW = 0; 
-		assign DivBusyE = 0;
-      end
-   endgenerate
+	// Handle sign extension for W-type instructions
+	flopenrc #(1) W64MReg(clk, reset, FlushM, ~StallM, W64E, W64M);
+	generate
+		if (`XLEN == 64) begin:resmux // RV64 has W-type instructions
+			assign MulDivResultM = W64M ? {{32{PrelimResultM[31]}}, PrelimResultM[31:0]} : PrelimResultM;
+		end else begin:resmux // RV32 has no W-type instructions
+			assign MulDivResultM = PrelimResultM;
+		end
+	endgenerate

+	// Writeback stage pipeline register
+	flopenrc #(`XLEN) MulDivResultWReg(clk, reset, FlushW, ~StallW, MulDivResultM, MulDivResultW);	 
 endmodule // muldiv


--- a/wally-pipelined/src/privileged/csr.sv
+++ b/wally-pipelined/src/privileged/csr.sv
@ -86,76 +86,48 @@ module csr #(parameter
  logic        IllegalCSRCAccessM, IllegalCSRMAccessM, IllegalCSRSAccessM, IllegalCSRUAccessM, IllegalCSRNAccessM, InsufficientCSRPrivilegeM;
  logic IllegalCSRMWriteReadonlyM;
  
-  generate
-    if (`ZICSR_SUPPORTED) begin
-      // modify CSRs
-      always_comb begin
-        // Choose either rs1 or uimm[4:0] as source
-        CSRSrcM = InstrM[14] ? {{(`XLEN-5){1'b0}}, InstrM[19:15]} : SrcAM;
-        // Compute AND/OR modification
-        CSRRWM = CSRSrcM;
-        CSRRSM = CSRReadValM | CSRSrcM;
-        CSRRCM = CSRReadValM & ~CSRSrcM;
-        case (InstrM[13:12])
-          2'b01:  CSRWriteValM = CSRRWM;
-          2'b10:  CSRWriteValM = CSRRSM;
-          2'b11:  CSRWriteValM = CSRRCM;
-          default: CSRWriteValM = CSRReadValM;
-        endcase
-      end
+  // modify CSRs
+  // Computes the candidate value to write back to a CSR (CSRWriteValM) from the
+  // current CSR read value and a source operand chosen by the instruction bits.
+  always_comb begin
+    // Choose either rs1 or uimm[4:0] as source
+    // InstrM[14] set: use InstrM[19:15] zero-extended to XLEN; otherwise use SrcAM
+    CSRSrcM = InstrM[14] ? {{(`XLEN-5){1'b0}}, InstrM[19:15]} : SrcAM;
+    // Compute AND/OR modification
+    CSRRWM = CSRSrcM;                 // plain write: source replaces the CSR value
+    CSRRSM = CSRReadValM | CSRSrcM;   // set: bits that are 1 in the source are set
+    CSRRCM = CSRReadValM & ~CSRSrcM;  // clear: bits that are 1 in the source are cleared
+    // InstrM[13:12] selects which of the three candidate values is written
+    case (InstrM[13:12])
+      2'b01:  CSRWriteValM = CSRRWM;
+      2'b10:  CSRWriteValM = CSRRSM;
+      2'b11:  CSRWriteValM = CSRRCM;
+      default: CSRWriteValM = CSRReadValM; // 2'b00: leave the CSR value unchanged
+    endcase
+  end

-      // write CSRs
-      assign CSRAdrM = InstrM[31:20];
-      assign UnalignedNextEPCM = TrapM ? PCM : CSRWriteValM;
-      assign NextEPCM = `C_SUPPORTED ? {UnalignedNextEPCM[`XLEN-1:1], 1'b0} : {UnalignedNextEPCM[`XLEN-1:2], 2'b00}; // 3.1.15 alignment
-      assign NextCauseM = TrapM ? CauseM : CSRWriteValM;
-      assign NextMtvalM = TrapM ? NextFaultMtvalM : CSRWriteValM;
-      assign CSRMWriteM = CSRWriteM && (PrivilegeModeW == `M_MODE);
-      assign CSRSWriteM = CSRWriteM && (|PrivilegeModeW);
-      assign CSRUWriteM = CSRWriteM;  
+  // write CSRs
+  assign CSRAdrM = InstrM[31:20];
+  assign UnalignedNextEPCM = TrapM ? PCM : CSRWriteValM;
+  assign NextEPCM = `C_SUPPORTED ? {UnalignedNextEPCM[`XLEN-1:1], 1'b0} : {UnalignedNextEPCM[`XLEN-1:2], 2'b00}; // 3.1.15 alignment
+  assign NextCauseM = TrapM ? CauseM : CSRWriteValM;
+  assign NextMtvalM = TrapM ? NextFaultMtvalM : CSRWriteValM;
+  assign CSRMWriteM = CSRWriteM && (PrivilegeModeW == `M_MODE);
+  assign CSRSWriteM = CSRWriteM && (|PrivilegeModeW);
+  assign CSRUWriteM = CSRWriteM;  

-      csri  csri(.*);
-      csrsr csrsr(.*);
-      csrc  counters(.*);
-      csrm  csrm(.*); // Machine Mode CSRs
-      csrs  csrs(.*);
-      csrn  csrn(.CSRNWriteM(CSRUWriteM), .*);  // User Mode Exception Registers
-      csru  csru(.*); // Floating Point Flags are part of User MOde
+  // CSR sub-blocks; ports are connected implicitly by name through .*
+  csri  csri(.*);
+  csrsr csrsr(.*);
+  csrc  counters(.*);
+  csrm  csrm(.*); // Machine Mode CSRs
+  csrs  csrs(.*);
+  // csrn's CSRNWriteM port is explicitly driven by CSRUWriteM; all other ports connect by name
+  csrn  csrn(.CSRNWriteM(CSRUWriteM), .*);  // User Mode Exception Registers
+  csru  csru(.*); // Floating Point Flags are part of User Mode

-      // merge CSR Reads
-      assign CSRReadValM = CSRUReadValM | CSRSReadValM | CSRMReadValM | CSRCReadValM | CSRNReadValM; 
-      // *** add W stall 2/22/21 dh to try fixing memory stalls
-//      floprc #(`XLEN) CSRValWReg(clk, reset, FlushW, CSRReadValM, CSRReadValW);
-      flopenrc #(`XLEN) CSRValWReg(clk, reset, FlushW, ~StallW, CSRReadValM, CSRReadValW);
+  // merge CSR Reads
+  assign CSRReadValM = CSRUReadValM | CSRSReadValM | CSRMReadValM | CSRCReadValM | CSRNReadValM; 
+  flopenrc #(`XLEN) CSRValWReg(clk, reset, FlushW, ~StallW, CSRReadValM, CSRReadValW);

-      // merge illegal accesses: illegal if none of the CSR addresses is legal or privilege is insufficient
-      assign InsufficientCSRPrivilegeM = (CSRAdrM[9:8] == 2'b11 && PrivilegeModeW != `M_MODE) ||
-                                        (CSRAdrM[9:8] == 2'b01 && PrivilegeModeW == `U_MODE);
-      assign IllegalCSRAccessM = ((IllegalCSRCAccessM && IllegalCSRMAccessM && 
-        IllegalCSRSAccessM && IllegalCSRUAccessM  && IllegalCSRNAccessM ||
-        InsufficientCSRPrivilegeM) && CSRReadM) || IllegalCSRMWriteReadonlyM;
-    end else begin // CSRs not implemented
-      assign STATUS_MPP = 2'b11;
-      assign STATUS_SPP = 2'b0;
-      assign STATUS_TSR = 0;
-      assign MEPC_REGW = 0;
-      assign SEPC_REGW = 0;
-      assign UEPC_REGW = 0;
-      assign UTVEC_REGW = 0;
-      assign STVEC_REGW = 0;
-      assign MTVEC_REGW = 0;
-      assign MEDELEG_REGW = 0;
-      assign MIDELEG_REGW = 0;
-      assign SEDELEG_REGW = 0;
-      assign SIDELEG_REGW = 0;
-      assign SATP_REGW = 0;
-      assign MIP_REGW = 0;
-      assign MIE_REGW = 0;
-      assign STATUS_MIE = 0;
-      assign STATUS_SIE = 0;
-      assign FRM_REGW = 0;
-      assign CSRReadValM = 0;
-      assign IllegalCSRAccessM = CSRReadM;
-    end
-  endgenerate
+  // merge illegal accesses: illegal if none of the CSR addresses is legal or privilege is insufficient
+  assign InsufficientCSRPrivilegeM = (CSRAdrM[9:8] == 2'b11 && PrivilegeModeW != `M_MODE) ||
+                                    (CSRAdrM[9:8] == 2'b01 && PrivilegeModeW == `U_MODE);
+  assign IllegalCSRAccessM = ((IllegalCSRCAccessM && IllegalCSRMAccessM && 
+    IllegalCSRSAccessM && IllegalCSRUAccessM  && IllegalCSRNAccessM ||
+    InsufficientCSRPrivilegeM) && CSRReadM) || IllegalCSRMWriteReadonlyM;
 endmodule
--- a/wally-pipelined/src/privileged/privileged.sv
+++ b/wally-pipelined/src/privileged/privileged.sv
@ -239,8 +239,6 @@ module privileged (
            .ExceptionM,
            .PendingInterruptM,
            .PrivilegedNextPCM, .CauseM, .NextFaultMtvalM);
-
-
 endmodule


--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -198,7 +198,6 @@ module wallypipelinedhart (
 	  
 	  ); // instruction fetch unit: PC, branch prediction, instruction cache
    
-
  ieu ieu(
     .clk, .reset,

@ -276,7 +275,7 @@ module wallypipelinedhart (
 	.LSUStall);                     // change to LSUStall


-  
+   // *** Ross: please make EBU conditional when only supporting internal memories

  ahblite ebu(// IFU connections
     .clk, .reset,
@ -295,22 +294,7 @@ module wallypipelinedhart (
     .HWRITED);

  
-  muldiv mdu(
-     .clk, .reset,
-	// Execute Stage interface
-	//   .SrcAE, .SrcBE,
-	.ForwardedSrcAE, .ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
-	.Funct3E, .Funct3M,
-     .MulDivE, .W64E,
-	// Writeback stage
-     .MulDivResultW,
-     // Divide Done
-	.DivBusyE, 
-	// hazards
-	.StallM, .StallW, .FlushM, .FlushW 
-  ); // multiply and divide unit
-  
-  hazard     hzu(
+   hazard     hzu(
     .BPPredWrongE, .CSRWritePendingDEM, .RetM, .TrapM,
     .LoadStallD, .StoreStallD, .MulDivStallD, .CSRRdStallD,
     .LSUStall, .ICacheStallF,
@ -323,57 +307,89 @@ module wallypipelinedhart (
 	.FlushF, .FlushD, .FlushE, .FlushM, .FlushW
     );	// global stall and flush control

-  // Priveleged block operates in M and W stages, handling CSRs and exceptions
-  privileged priv(
-     .clk, .reset,
-     .FlushD, .FlushE, .FlushM, .FlushW, 
-     .StallD, .StallE, .StallM, .StallW,
-     .CSRReadM, .CSRWriteM, .SrcAM, .PCM,
-     .InstrM, .CSRReadValW, .PrivilegedNextPCM,
-     .RetM, .TrapM, 
-     .ITLBFlushF, .DTLBFlushM,
-     .InstrValidM, .CommittedM,
-     .FRegWriteM, .LoadStallD,
-     .BPPredDirWrongM, .BTBPredPCWrongM,
-     .RASPredPCWrongM, .BPPredClassNonCFIWrongM,
-     .InstrClassM, .DCacheMiss, .DCacheAccess, .PrivilegedM,
-     .ITLBInstrPageFaultF, .DTLBLoadPageFaultM, .DTLBStorePageFaultM,
-     .WalkerInstrPageFaultF, .WalkerLoadPageFaultM, .WalkerStorePageFaultM,
-     .InstrMisalignedFaultM, .IllegalIEUInstrFaultD, .IllegalFPUInstrD,
-     .LoadMisalignedFaultM, .StoreMisalignedFaultM,
-     .TimerIntM, .ExtIntM, .SwIntM,
-     .MTIME_CLINT, .MTIMECMP_CLINT,
-     .InstrMisalignedAdrM, .MemAdrM,
-     .SetFflagsM,
-     // Trap signals from pmp/pma in mmu
-     // *** do these need to be split up into one for dmem and one for ifu?
-     // instead, could we only care about the instr and F pins that come from ifu and only care about the load/store and m pins that come from dmem?
-     .InstrAccessFaultF, .LoadAccessFaultM, .StoreAccessFaultM,
-     .ExceptionM, .PendingInterruptM, .IllegalFPUInstrE,
-     .PrivilegeModeW, .SATP_REGW,
-     .STATUS_MXR, .STATUS_SUM, .STATUS_MPRV, .STATUS_MPP,
-     .PMPCFG_ARRAY_REGW, .PMPADDR_ARRAY_REGW, 
-     .FRM_REGW,.BreakpointFaultM, .EcallFaultM
-  );
-  
+   // Optional units: each is instantiated only when its extension is configured;
+   // otherwise the signals it would have driven are tied to constants.
+   generate
+      // Privileged block operates in M and W stages, handling CSRs and exceptions
+      if (`ZICSR_SUPPORTED) begin:priv
+         privileged priv(
+            .clk, .reset,
+            .FlushD, .FlushE, .FlushM, .FlushW, 
+            .StallD, .StallE, .StallM, .StallW,
+            .CSRReadM, .CSRWriteM, .SrcAM, .PCM,
+            .InstrM, .CSRReadValW, .PrivilegedNextPCM,
+            .RetM, .TrapM, 
+            .ITLBFlushF, .DTLBFlushM,
+            .InstrValidM, .CommittedM,
+            .FRegWriteM, .LoadStallD,
+            .BPPredDirWrongM, .BTBPredPCWrongM,
+            .RASPredPCWrongM, .BPPredClassNonCFIWrongM,
+            .InstrClassM, .DCacheMiss, .DCacheAccess, .PrivilegedM,
+            .ITLBInstrPageFaultF, .DTLBLoadPageFaultM, .DTLBStorePageFaultM,
+            .WalkerInstrPageFaultF, .WalkerLoadPageFaultM, .WalkerStorePageFaultM,
+            .InstrMisalignedFaultM, .IllegalIEUInstrFaultD, .IllegalFPUInstrD,
+            .LoadMisalignedFaultM, .StoreMisalignedFaultM,
+            .TimerIntM, .ExtIntM, .SwIntM,
+            .MTIME_CLINT, .MTIMECMP_CLINT,
+            .InstrMisalignedAdrM, .MemAdrM,
+            .SetFflagsM,
+            // Trap signals from pmp/pma in mmu
+            // *** do these need to be split up into one for dmem and one for ifu?
+            // instead, could we only care about the instr and F pins that come from ifu and only care about the load/store and m pins that come from dmem?
+            .InstrAccessFaultF, .LoadAccessFaultM, .StoreAccessFaultM,
+            .ExceptionM, .PendingInterruptM, .IllegalFPUInstrE,
+            .PrivilegeModeW, .SATP_REGW,
+            .STATUS_MXR, .STATUS_SUM, .STATUS_MPRV, .STATUS_MPP,
+            .PMPCFG_ARRAY_REGW, .PMPADDR_ARRAY_REGW, 
+            .FRM_REGW,.BreakpointFaultM, .EcallFaultM
+         );
+      end else begin // no privileged unit; tie off the signals it would have driven
+         assign CSRReadValW = 0;
+         assign PrivilegedNextPCM = 0;
+         assign RetM = 0;
+         assign TrapM = 0;
+         assign ITLBFlushF = 0;
+         assign DTLBFlushM = 0;
+         // Constants that csr.sv assigned when CSRs were not implemented, before the
+         // optional-unit generate moved here; without them these signals are undriven
+         assign STATUS_MPP = 2'b11;
+         assign SATP_REGW = 0;
+         assign FRM_REGW = 0;
+         // NOTE(review): other privileged connections (e.g. PrivilegeModeW, STATUS_MXR/SUM/MPRV,
+         // the PMP arrays, ExceptionM) are also left undriven in this configuration --
+         // confirm which of them are outputs of the privileged unit and tie those off too
+      end
+
+      // Integer multiply/divide unit
+      if (`M_SUPPORTED) begin:mdu
+         muldiv mdu(
+            .clk, .reset,
+            .ForwardedSrcAE, .ForwardedSrcBE, 
+            .Funct3E, .Funct3M, .MulDivE, .W64E,
+            .MulDivResultW, .DivBusyE, 
+            .StallM, .StallW, .FlushM, .FlushW 
+         ); 
+      end else begin // no M instructions supported
+         assign MulDivResultW = 0; 
+         assign DivBusyE = 0;
+      end
+
+      // Floating point unit. Only F_SUPPORTED is tested here; a configuration with
+      // D but not F is rejected by the testbench assertions.
+      if (`F_SUPPORTED) begin:fpu
+         fpu fpu(
+            .clk, .reset,
+            .FRM_REGW, // Rounding mode from CSR
+            .InstrD, // instruction from IFU
+            .ReadDataW,// Read data from memory
+            .ForwardedSrcAE, // Integer input being processed (from IEU)
+            .StallE, .StallM, .StallW, // stall signals from HZU
+            .FlushE, .FlushM, .FlushW, // flush signals from HZU
+            .RdM, .RdW, // which FP register to write to (from IEU)
+            .FRegWriteM, // FP register write enable
+            .FStallD, // Stall the decode stage
+            .FWriteIntE, // integer register write enable
+            .FWriteDataE, // Data to be written to memory
+            .FIntResM, // data to be written to integer register
+            .FDivBusyE, // Is the divide/sqrt unit busy (stall execute stage)
+            .IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+            .SetFflagsM        // FPU flags (to privileged unit)
+         ); // floating point unit
+      end else begin // no FPU: tie outputs off (all zero except IllegalFPUInstrD)
+         assign FStallD = 0;
+         assign FWriteIntE = 0; 
+         assign FWriteDataE = 0;
+         assign FIntResM = 0;
+         assign FDivBusyE = 0;
+         assign IllegalFPUInstrD = 1; // held high so the illegal-FPU-instruction flag is always asserted
+         assign SetFflagsM = 0;
+      end
+
+   endgenerate
 endmodule
--- a/wally-pipelined/testbench/sdc/ram2sdLoad.py
+++ b/wally-pipelined/testbench/sdc/ram2sdLoad.py
--- a/wally-pipelined/testbench/sdc/ramdisk2.hex
+++ b/wally-pipelined/testbench/sdc/ramdisk2.hex
--- a/wally-pipelined/testbench/sdc/run_tb.do
+++ b/wally-pipelined/testbench/sdc/run_tb.do
--- a/wally-pipelined/testbench/sdc/sd_top_tb.sv
+++ b/wally-pipelined/testbench/sdc/sd_top_tb.sv
--- a/wally-pipelined/testbench/sdc/wave.do
+++ b/wally-pipelined/testbench/sdc/wave.do
--- a/wally-pipelined/testbench/testbench-linux.sv
+++ b/wally-pipelined/testbench/testbench-linux.sv
@ -174,7 +174,7 @@ module testbench();
  // Useful Aliases
  `define RF          dut.hart.ieu.dp.regf.rf
  `define PC          dut.hart.ifu.pcreg.q
-  `define CSR_BASE    dut.hart.priv.csr.genblk1
+  `define CSR_BASE    dut.hart.priv.priv.csr
  `define HPMCOUNTER  `CSR_BASE.counters.genblk1.HPMCOUNTER_REGW
  `define PMP_BASE    `CSR_BASE.csrm.genblk4
  `define PMPCFG      genblk2.PMPCFGreg.q
@ -210,8 +210,8 @@ module testbench();
  `define STATUS_MIE  `CSR_BASE.csrsr.STATUS_MIE
  `define STATUS_SIE  `CSR_BASE.csrsr.STATUS_SIE
  `define STATUS_UIE  `CSR_BASE.csrsr.STATUS_UIE
-  `define PRIV        dut.hart.priv.privmodereg.q
-  `define INSTRET     dut.hart.priv.csr.genblk1.counters.genblk1.genblk2.INSTRETreg.q
+  `define PRIV        dut.hart.priv.priv.privmodereg.q
+  `define INSTRET     dut.hart.priv.priv.csr.counters.genblk1.genblk2.INSTRETreg.q
  // Common Macros
  `define checkCSR(CSR) \
    begin \
@ -308,9 +308,9 @@ module testbench();
  integer ramFile;
  integer readResult;
  initial begin
-    force dut.hart.priv.SwIntM = 0;
-    force dut.hart.priv.TimerIntM = 0;
-    force dut.hart.priv.ExtIntM = 0;    
+    force dut.hart.priv.priv.SwIntM = 0;
+    force dut.hart.priv.priv.TimerIntM = 0;
+    force dut.hart.priv.priv.ExtIntM = 0;    
    $readmemh({`LINUX_TEST_VECTORS,"bootmem.txt"}, dut.uncore.bootrom.bootrom.RAM, 'h1000 >> 3);
    $readmemb(`TWO_BIT_PRELOAD, dut.hart.ifu.bpred.bpred.Predictor.DirPredictor.PHT.mem);
    $readmemb(`BTB_PRELOAD, dut.hart.ifu.bpred.bpred.TargetPredictor.memory.mem);
@ -365,7 +365,7 @@ module testbench();
  // on the next falling edge the expected state is compared to the wally state.

  // step 0: read the expected state
-  assign checkInstrM = dut.hart.ieu.InstrValidM & ~dut.hart.priv.trap.InstrPageFaultM & ~dut.hart.priv.trap.InterruptM & ~dut.hart.StallM;
+  assign checkInstrM = dut.hart.ieu.InstrValidM & ~dut.hart.priv.priv.trap.InstrPageFaultM & ~dut.hart.priv.priv.trap.InterruptM & ~dut.hart.StallM;
  `define SCAN_NEW_INSTR_FROM_TRACE(STAGE) \
    // always check PC, instruction bits \
    if (checkInstrM) begin \
@ -479,7 +479,7 @@ module testbench();
      end else begin // update MIP immediately
        $display("%tns: Updating MIP to %x",$time,NextMIPexpected);
        MIPexpected = NextMIPexpected;
-        force dut.hart.priv.csr.genblk1.csri.MIP_REGW = MIPexpected;
+        force dut.hart.priv.priv.csr.csri.MIP_REGW = MIPexpected;
      end
      // $display("%tn: ExpectedCSRArrayM = %p",$time,ExpectedCSRArrayM);
      // $display("%tn: ExpectedCSRArrayValueM = %p",$time,ExpectedCSRArrayValueM);
@ -491,11 +491,11 @@ module testbench();
      // $display("%tn: ExpectedCSRArrayValueM[NumCSRM] %x",$time,ExpectedCSRArrayValueM[NumCSRM]);
    end
    if(RequestDelayedMIP & checkInstrM) begin
-      $display("%tns: Executing Delayed MIP. Current MEPC value is %x",$time,dut.hart.priv.csr.genblk1.csrm.MEPC_REGW);
+      $display("%tns: Executing Delayed MIP. Current MEPC value is %x",$time,dut.hart.priv.priv.csr.csrm.MEPC_REGW);
      $display("%tns: Updating MIP to %x",$time,NextMIPexpected);
      MIPexpected = NextMIPexpected;
-      force dut.hart.priv.csr.genblk1.csri.MIP_REGW = MIPexpected;
-      $display("%tns: Finished Executing Delayed MIP. Current MEPC value is %x",$time,dut.hart.priv.csr.genblk1.csrm.MEPC_REGW);
+      force dut.hart.priv.priv.csr.csri.MIP_REGW = MIPexpected;
+      $display("%tns: Finished Executing Delayed MIP. Current MEPC value is %x",$time,dut.hart.priv.priv.csr.csrm.MEPC_REGW);
      RequestDelayedMIP = 0;
    end
  end
@ -576,7 +576,7 @@ module testbench();
        `checkEQ("PCW",PCW,ExpectedPCW)
        //`checkEQ("InstrW",InstrW,ExpectedInstrW) <-- not viable because of
        // compressed to uncompressed conversion
-        `checkEQ("Instr Count",dut.hart.priv.csr.genblk1.counters.genblk1.INSTRET_REGW,InstrCountW)
+        `checkEQ("Instr Count",dut.hart.priv.priv.csr.counters.genblk1.INSTRET_REGW,InstrCountW)
        #2; // delay 2 ns.
        if(`DEBUG_TRACE >= 5) begin
          $display("%tns, %d instrs: Reg Write Address %02d ? expected value: %02d", $time, InstrCountW, dut.hart.ieu.dp.regf.a3, ExpectedRegAdrW);
@ -601,19 +601,19 @@ module testbench();
        // check csr
        for(NumCSRPostWIndex = 0; NumCSRPostWIndex < NumCSRW; NumCSRPostWIndex++) begin
          case(ExpectedCSRArrayW[NumCSRPostWIndex])
-            "mhartid": `checkCSR(dut.hart.priv.csr.genblk1.csrm.MHARTID_REGW)
-            "mstatus": `checkCSR(dut.hart.priv.csr.genblk1.csrm.MSTATUS_REGW)
-            "mtvec":   `checkCSR(dut.hart.priv.csr.genblk1.csrm.MTVEC_REGW)
-            "mip":     `checkCSR(dut.hart.priv.csr.genblk1.csrm.MIP_REGW)
-            "mie":     `checkCSR(dut.hart.priv.csr.genblk1.csrm.MIE_REGW)
-            "mideleg": `checkCSR(dut.hart.priv.csr.genblk1.csrm.MIDELEG_REGW)
-            "medeleg": `checkCSR(dut.hart.priv.csr.genblk1.csrm.MEDELEG_REGW)
-            "mepc":    `checkCSR(dut.hart.priv.csr.genblk1.csrm.MEPC_REGW)
-            "mtval":   `checkCSR(dut.hart.priv.csr.genblk1.csrm.MTVAL_REGW)
-            "sepc":    `checkCSR(dut.hart.priv.csr.genblk1.csrs.SEPC_REGW)
-            "scause":  `checkCSR(dut.hart.priv.csr.genblk1.csrs.genblk1.SCAUSE_REGW)
-            "stvec":   `checkCSR(dut.hart.priv.csr.genblk1.csrs.STVEC_REGW)
-            "stval":   `checkCSR(dut.hart.priv.csr.genblk1.csrs.genblk1.STVAL_REGW)
+            "mhartid": `checkCSR(dut.hart.priv.priv.csr.csrm.MHARTID_REGW)
+            "mstatus": `checkCSR(dut.hart.priv.priv.csr.csrm.MSTATUS_REGW)
+            "mtvec":   `checkCSR(dut.hart.priv.priv.csr.csrm.MTVEC_REGW)
+            "mip":     `checkCSR(dut.hart.priv.priv.csr.csrm.MIP_REGW)
+            "mie":     `checkCSR(dut.hart.priv.priv.csr.csrm.MIE_REGW)
+            "mideleg": `checkCSR(dut.hart.priv.priv.csr.csrm.MIDELEG_REGW)
+            "medeleg": `checkCSR(dut.hart.priv.priv.csr.csrm.MEDELEG_REGW)
+            "mepc":    `checkCSR(dut.hart.priv.priv.csr.csrm.MEPC_REGW)
+            "mtval":   `checkCSR(dut.hart.priv.priv.csr.csrm.MTVAL_REGW)
+            "sepc":    `checkCSR(dut.hart.priv.priv.csr.csrs.SEPC_REGW)
+            "scause":  `checkCSR(dut.hart.priv.priv.csr.csrs.genblk1.SCAUSE_REGW)
+            "stvec":   `checkCSR(dut.hart.priv.priv.csr.csrs.STVEC_REGW)
+            "stval":   `checkCSR(dut.hart.priv.priv.csr.csrs.genblk1.STVAL_REGW)
          endcase
        end
        if (fault == 1) begin
@ -667,7 +667,7 @@ module testbench();
    begin
      int i;
      // Grab the SATP register from privileged unit
-      SATP = dut.hart.priv.csr.SATP_REGW;
+      SATP = dut.hart.priv.priv.csr.SATP_REGW;
      // Split the virtual address into page number segments and offset
      VPN[2] = adrIn[38:30];
      VPN[1] = adrIn[29:21];
@ -677,7 +677,7 @@ module testbench();
      SvMode = SATP[63];
      // Only perform translation if translation is on and the processor is not
      // in machine mode
-      if (SvMode && (dut.hart.priv.PrivilegeModeW != `M_MODE)) begin
+      if (SvMode && (dut.hart.priv.priv.PrivilegeModeW != `M_MODE)) begin
        BaseAdr = SATP[43:0] << 12;
        for (i = 2; i >= 0; i--) begin
          PAdr = BaseAdr + (VPN[i] << 3);
--- a/wally-pipelined/testbench/testbench.sv
+++ b/wally-pipelined/testbench/testbench.sv
@ -287,7 +287,7 @@ logic [3:0] dummy;

  // Termination condition
  // terminate on a specific ECALL for Imperas tests, or on a jump to self infinite loop for RISC-V Arch tests
-  assign DCacheFlushStart = dut.hart.priv.EcallFaultM && 
+  assign DCacheFlushStart = dut.hart.priv.priv.EcallFaultM && 
 			    (dut.hart.ieu.dp.regf.rf[3] == 1 || 
 			     (dut.hart.ieu.dp.regf.we3 && 
 			      dut.hart.ieu.dp.regf.a3 == 3 && 
@ -318,7 +318,7 @@ module riscvassertions;
  initial begin
    assert (`PMP_ENTRIES == 0 || `PMP_ENTRIES==16 || `PMP_ENTRIES==64) else $error("Illegal number of PMP entries: PMP_ENTRIES must be 0, 16, or 64");
    assert (`DIV_BITSPERCYCLE == 1 || `DIV_BITSPERCYCLE==2 || `DIV_BITSPERCYCLE==4) else $error("Illegal number of divider bits/cycle: DIV_BITSPERCYCLE must be 1, 2, or 4");
-    assert (`F_SUPPORTED || ~`D_SUPPORTED) else $error("Can't support double without supporting float");
+    assert (`F_SUPPORTED || ~`D_SUPPORTED) else $error("Can't support double (D) without supporting float (F)");
    assert (`XLEN == 64 || ~`D_SUPPORTED) else $error("Wally does not yet support D extensions on RV32");
    assert (`DCACHE_WAYSIZEINBYTES <= 4096 || `MEM_DCACHE == 0 || `MEM_VIRTMEM == 0) else $error("DCACHE_WAYSIZEINBYTES cannot exceed 4 KiB when caches and vitual memory is enabled (to prevent aliasing)");
    assert (`DCACHE_BLOCKLENINBITS >= 128 || `MEM_DCACHE == 0) else $error("DCACHE_BLOCKLENINBITS must be at least 128 when caches are enabled");