cvw/src/fpu/fpu.sv

///////////////////////////////////////////
// fpu.sv
//
// Written: me@KatherineParry.com, James Stine, Brett Mathis, David Harris
// Modified: 6/23/2021
//
// Purpose: Floating Point Unit Top-Level Interface
// 
// Documentation: RISC-V System on Chip Design Chapter 13
//
// A component of the CORE-V-WALLY configurable RISC-V project.
// 
// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
//
// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
//
// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
// may obtain a copy of the License at
//
// https://solderpad.org/licenses/SHL-2.1/
//
// Unless required by applicable law or agreed to in writing, any work distributed under the 
// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
// either express or implied. See the License for the specific language governing permissions 
// and limitations under the License.
////////////////////////////////////////////////////////////////////////////////////////////////

`include "wally-config.vh"

module fpu (
  input  logic 		          clk,
  input  logic 		          reset,
  // Hazards
  input  logic 		          StallE, StallM, StallW, // stall signals (from HZU)
  input  logic 		          FlushE, FlushM, FlushW, // flush signals (from HZU)
  output logic 		          FPUStallD,     // Stall the decode stage (To HZU)
  output logic 		          FDivBusyE,     // Is the divide/sqrt unit busy (stall execute stage) (to HZU)
  // CSRs
  input  logic [1:0]        STATUS_FS,     // Is floating-point enabled? (From privileged unit)
  input  logic [2:0] 	      FRM_REGW,      // Rounding mode (from CSR)
  // Decode stage 
  input  logic [31:0] 	    InstrD,        // instruction (from IFU)
  // Execute stage 
  input  logic [2:0] 	      Funct3E,       // Funct fields of instruction specify type of operations
  input  logic 		          IntDivE, W64E, // Integer division on FPU
  input  logic [`XLEN-1:0]  ForwardedSrcAE, ForwardedSrcBE, // Integer input for convert, move, and int div (from IEU)
  input  logic [4:0] 	      RdE,           // which FP register to write to (from IEU)
  output logic 		          FWriteIntE,    // integer register write enable (to IEU)
  output logic              FCvtIntE,      // Convert to int (to IEU)
  // Memory stage 
  input  logic [2:0] 	      Funct3M,       // Funct fields of instruction specify type of operations
  input  logic [4:0] 	      RdM,           // which FP register to write to (from IEU)
  output logic 		          FRegWriteM,    // FP register write enable (to privileged unit)
  output logic 		          FpLoadStoreM,  // Fp load instruction? (to LSU)
  output logic [`FLEN-1:0]  FWriteDataM,   // Data to be written to memory (to LSU) 
  output logic [`XLEN-1:0]  FIntResM,      // data to be written to integer register (to IEU)
  output logic 		          IllegalFPUInstrD, // Is the instruction an illegal fpu instruction (to IFU)
  output logic [4:0] 	      SetFflagsM,    // FPU flags (to privileged unit)
  // Writeback stage 
  input  logic [4:0] 	      RdW,           // which FP register to write to (from IEU)
  input  logic [`FLEN-1:0]  ReadDataW,     // Read data (from LSU)
  output logic [`XLEN-1:0]  FCvtIntResW,   // convert result to to be written to integer register (to IEU)
  output logic              FCvtIntW,      // select FCvtIntRes (to IEU)
  output logic [`XLEN-1:0]  FIntDivResultW // Result from integer division (to IEU)
);

  // RISC-V FPU specifics:
  //    - multiprecision support uses NAN-boxing, putting 1's in unused msbs
  //    - RISC-V detects underflow after rounding

  // control signals
  logic 		            FRegWriteW;                        // FP register write enable
  logic [2:0] 	        FrmM;                              // FP rounding mode
  logic [`FMTBITS-1:0]  FmtE, FmtM;                        // FP precision 0-single 1-double
  logic 		            FDivStartE, IDivStartE;            // Start division or squareroot
  logic 		            FWriteIntM;                        // Write to integer register
  logic [1:0] 	        ForwardXE, ForwardYE, ForwardZE;   // forwarding mux control signals
  logic [2:0] 	        OpCtrlE, OpCtrlM;                  // Select which opperation to do in each component
  logic [1:0] 	        FResSelE, FResSelM, FResSelW;      // Select one of the results that finish in the memory stage
  logic [1:0] 	        PostProcSelE, PostProcSelM;        // select result in the post processing unit
  logic [4:0] 	        Adr1D, Adr2D, Adr3D;               // register adresses of each input
  logic [4:0] 	        Adr1E, Adr2E, Adr3E;                // register adresses of each input
  logic                 XEnD, YEnD, ZEnD;                  // X, Y, Z inputs used for current operation
  logic                 XEnE, YEnE, ZEnE;                  // X, Y, Z inputs used for current operation
  logic                 FRegWriteE;                        // Write floating-point register

  // regfile signals
  logic [`FLEN-1:0] FRD1D, FRD2D, FRD3D;                  // Read Data from FP register - decode stage
  logic [`FLEN-1:0] FRD1E, FRD2E, FRD3E;                  // Read Data from FP register - execute stage
  logic [`FLEN-1:0] XE;                                   // Input 1 to the various units (after forwarding)
  logic [`XLEN-1:0] IntSrcXE;                             // Input 1 to the various units (after forwarding)
  logic [`FLEN-1:0] PreYE, YE;                            // Input 2 to the various units (after forwarding)
  logic [`FLEN-1:0] PreZE, ZE;                            // Input 3 to the various units (after forwarding)

  // unpacking signals
  logic 		        XsE, YsE, ZsE;                        // input's sign - execute stage
  logic 		        XsM, YsM;                             // input's sign - memory stage
  logic [`NE-1:0]   XeE, YeE, ZeE;                        // input's exponent - execute stage
  logic [`NE-1:0] 	ZeM;                                  // input's exponent - memory stage
  logic [`NF:0] 	  XmE, YmE, ZmE;                        // input's significand - execute stage
  logic [`NF:0] 	  XmM, YmM, ZmM;                        // input's significand - memory stage
  logic 		        XNaNE, YNaNE, ZNaNE;                  // is the input a NaN - execute stage
  logic 		        XNaNM, YNaNM, ZNaNM;                  // is the input a NaN - memory stage
  logic 		        XSNaNE, YSNaNE, ZSNaNE;               // is the input a signaling NaN - execute stage
  logic 		        XSNaNM, YSNaNM, ZSNaNM;               // is the input a signaling NaN - memory stage
  logic 		        XSubnormE;                            // is the input subnormal
  logic 		        XZeroE, YZeroE, ZZeroE;               // is the input zero - execute stage
  logic 		        XZeroM, YZeroM;                       // is the input zero - memory stage
  logic 		        XInfE, YInfE, ZInfE;                  // is the input infinity - execute stage
  logic 		        XInfM, YInfM, ZInfM;                  // is the input infinity - memory stage
  logic 		        XExpMaxE;                             // is the exponent all ones (max value)
  logic [`FLEN-1:0] XPostBoxE;                            // X after fixing bad NaN box.  Needed for 1-input operations

  // Fma Signals
  logic             FmaAddSubE;                           // Multiply by 1.0 when adding or subtracting
  logic [1:0]       FmaZSelE;                             // Select Z = Y when adding or subtracting, 0 when multiplying
  logic [3*`NF+3:0] SmE, SmM;                             // Sum significand
  logic 			      FmaAStickyE, FmaAStickyM;             // FMA addend sticky bit output
  logic [`NE+1:0]   SeE,SeM;                              // Sum exponent
  logic 			      InvAE, InvAM;                         // Invert addend
  logic 			      AsE, AsM;                             // Addend sign
  logic 			      PsE, PsM;                             // Product sign
  logic 			      SsE, SsM;                             // Sum sign
  logic [$clog2(3*`NF+5)-1:0] SCntE, SCntM;               // LZA sum leading zero count
  
  // Cvt Signals
  logic [`NE:0]           CeE, CeM;                       // convert intermediate expoent
  logic [`LOGCVTLEN-1:0]  CvtShiftAmtE, CvtShiftAmtM;     // how much to shift by
  logic                   CvtResSubnormUfE, CvtResSubnormUfM; // does the result underflow or is subnormal
  logic                   CsE, CsM;                       // convert result sign
  logic                   IntZeroE, IntZeroM;             // is the integer zero?
  logic [`CVTLEN-1:0]     CvtLzcInE, CvtLzcInM;           // input to the Leading Zero Counter (priority encoder)
  logic [`XLEN-1:0]       FCvtIntResM;                    // fcvt integer result (for IEU)
  
  // divide signals
  logic [`DIVb:0]      QmM;                               // fdivsqrt signifcand
  logic [`NE+1:0]      QeM;                               // fdivsqrt exponent
  logic                DivStickyM;                        // fdivsqrt sticky bit
  logic                FDivDoneE, IFDivStartE;            // fdivsqrt control signals
  logic [`XLEN-1:0]    FIntDivResultM;                    // fdivsqrt integer division result (for IEU)

  // result and flag signals
  logic [`XLEN-1:0] ClassResE;                            // classify result
  logic [`FLEN-1:0] CmpFpResE;                            // compare result to FPU (min/max)
  logic [`XLEN-1:0] CmpIntResE;                           // compare result to IEU (eq/lt/le)
  logic 		        CmpNVE;                               // compare invalid flag (Not Valid)     
  logic [`FLEN-1:0] SgnResE;                              // sign injection result
  logic [`XLEN-1:0] FIntResE;                             // FPU to IEU E-stage result (classify, compare, move)
  logic [`FLEN-1:0] PostProcResM;                         // Postprocessor output
  logic [4:0] 	    PostProcFlgM;                         // Postprocessor flags
  logic  	          PreNVE, PreNVM;                       // selected flag that is ready in the memory stage     
  logic [`FLEN-1:0] FpResM, FpResW;                       // FPU preliminary result
  logic [`FLEN-1:0] PreFpResE, PreFpResM;                 // selected result that is ready in the memory stage
  logic [`FLEN-1:0] FResultW;                             // final FP result being written to the FP register   

  // other signals
  logic [`FLEN-1:0] AlignedSrcAE;                         // align SrcA from IEU to the floating point format for fmv
  logic [`FLEN-1:0] BoxedZeroE;                           // Zero value for Z for multiplication, with NaN boxing if needed
  logic [`FLEN-1:0] BoxedOneE;                            // One value for Z for multiplication, with NaN boxing if needed
  logic             StallUnpackedM;                       // Stall unpacker outputs during multicycle fdivsqrt
  logic [`FLEN-1:0] SgnExtXE;                             // Sign-extended X input for move to integer

  //////////////////////////////////////////////////////////////////////////////////////////
  // Decode Stage: fctrl decoder, read register file
  //////////////////////////////////////////////////////////////////////////////////////////

  // calculate FP control signals
  fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
              .Funct3E, .IntDivE, .InstrD,
              .StallE, .StallM, .StallW, .FlushE, .FlushM, .FlushW, .FRM_REGW, .STATUS_FS, .FDivBusyE,
              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .FrmM, .FmtE, .FmtM,
              .FDivStartE, .IDivStartE, .FWriteIntE, .FCvtIntE, .FWriteIntM, .OpCtrlE, .OpCtrlM, .FpLoadStoreM,
              .IllegalFPUInstrD, .XEnD, .YEnD, .ZEnD, .XEnE, .YEnE, .ZEnE,
              .FResSelE, .FResSelM, .FResSelW, .PostProcSelE, .PostProcSelM, .FCvtIntW, 
              .Adr1D, .Adr2D, .Adr3D, .Adr1E, .Adr2E, .Adr3E);

  // FP register file
  fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
    .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), 
    .a4(RdW), .wd4(FResultW),
    .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	

  // D/E pipeline registers  
  flopenrc #(`FLEN) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
  flopenrc #(`FLEN) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
  flopenrc #(`FLEN) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);

  //////////////////////////////////////////////////////////////////////////////////////////
  // Execute Stage: hazards, forwarding, unpacking, execution units
  //////////////////////////////////////////////////////////////////////////////////////////

  // Hazard unit for FPU: determines if any forwarding or stalls are needed
  fhazard fhazard(.Adr1D, .Adr2D, .Adr3D, .Adr1E, .Adr2E, .Adr3E, 
    .FRegWriteE, .FRegWriteM, .FRegWriteW, .RdE, .RdM, .RdW, .FResSelM, 
    .XEnD, .YEnD, .ZEnD, .FPUStallD, .ForwardXE, .ForwardYE, .ForwardZE);

  // forwarding muxs
  mux3  #(`FLEN)  fxemux (FRD1E, FResultW, PreFpResM, ForwardXE, XE);
  mux3  #(`FLEN)  fyemux (FRD2E, FResultW, PreFpResM, ForwardYE, PreYE);
  mux3  #(`FLEN)  fzemux (FRD3E, FResultW, PreFpResM, ForwardZE, PreZE);

  // Select NAN-boxed value of Y = 1.0 in proper format for fma to add/subtract X*Y+Z
  if(`FPSIZES == 1) assign BoxedOneE = {2'b0, {`NE-1{1'b1}}, (`NF)'(0)};
  else if(`FPSIZES == 2) 
      mux2 #(`FLEN) fonemux ({{`FLEN-`LEN1{1'b1}}, 2'b0, {`NE1-1{1'b1}}, (`NF1)'(0)}, {2'b0, {`NE-1{1'b1}}, (`NF)'(0)}, FmtE, BoxedOneE); // NaN boxing zeroes
  else if(`FPSIZES == 3 | `FPSIZES == 4) 
      mux4 #(`FLEN) fonemux ({{`FLEN-`S_LEN{1'b1}}, 2'b0, {`S_NE-1{1'b1}}, (`S_NF)'(0)}, 
                          {{`FLEN-`D_LEN{1'b1}}, 2'b0, {`D_NE-1{1'b1}}, (`D_NF)'(0)}, 
                          {{`FLEN-`H_LEN{1'b1}}, 2'b0, {`H_NE-1{1'b1}}, (`H_NF)'(0)}, 
                          {2'b0, {`NE-1{1'b1}}, (`NF)'(0)}, FmtE, BoxedOneE); // NaN boxing zeroes
  assign FmaAddSubE = OpCtrlE[2]&OpCtrlE[1]&(FResSelE==2'b01)&(PostProcSelE==2'b10);
  mux2  #(`FLEN)  fyaddmux (PreYE, BoxedOneE, FmaAddSubE, YE); // Force Y to be 1 for add/subtract
  
  // Select NAN-boxed value of Z = 0.0 in proper format for FMA for multiply X*Y+Z
  // For add and subtract, Z comes from second source operand
 if(`FPSIZES == 1) assign BoxedZeroE = 0;
  else if(`FPSIZES == 2) 
    mux2 #(`FLEN) fmulzeromux ({{`FLEN-`LEN1{1'b1}}, {`LEN1{1'b0}}}, (`FLEN)'(0), FmtE, BoxedZeroE); // NaN boxing zeroes
  else if(`FPSIZES == 3 | `FPSIZES == 4)
    mux4 #(`FLEN) fmulzeromux ({{`FLEN-`S_LEN{1'b1}}, {`S_LEN{1'b0}}}, 
                                {{`FLEN-`D_LEN{1'b1}}, {`D_LEN{1'b0}}}, 
                                {{`FLEN-`H_LEN{1'b1}}, {`H_LEN{1'b0}}}, 
                                (`FLEN)'(0), FmtE, BoxedZeroE); // NaN boxing zeroes
  assign FmaZSelE = {OpCtrlE[2]&OpCtrlE[1], OpCtrlE[2]&~OpCtrlE[1]};
  mux3  #(`FLEN)  fzmulmux (PreZE, BoxedZeroE, PreYE, FmaZSelE, ZE);

  // unpack unit: splits FP inputs into their parts and classifies SNaN, NaN, Subnorm, Norm, Zero, Infifnity
  unpack unpack (.X(XE), .Y(YE), .Z(ZE), .Fmt(FmtE), .Xs(XsE), .Ys(YsE), .Zs(ZsE), 
    .Xe(XeE), .Ye(YeE), .Ze(ZeE), .Xm(XmE), .Ym(YmE), .Zm(ZmE), .YEn(YEnE),
    .XNaN(XNaNE), .YNaN(YNaNE), .ZNaN(ZNaNE), .XSNaN(XSNaNE), .XEn(XEnE), 
    .YSNaN(YSNaNE), .ZSNaN(ZSNaNE), .XSubnorm(XSubnormE), 
    .XZero(XZeroE), .YZero(YZeroE), .ZZero(ZZeroE), .XInf(XInfE), .YInf(YInfE), 
    .ZEn(ZEnE), .ZInf(ZInfE), .XExpMax(XExpMaxE), .XPostBox(XPostBoxE));
  
  // fused multiply add: fadd/sub, fmul, fmadd/fnmadd/fmsub/fnmsub
  fma fma (.Xs(XsE), .Ys(YsE), .Zs(ZsE), .Xe(XeE), .Ye(YeE), .Ze(ZeE), .Xm(XmE), .Ym(YmE), .Zm(ZmE), 
    .XZero(XZeroE), .YZero(YZeroE), .ZZero(ZZeroE), .OpCtrl(OpCtrlE), 
    .As(AsE), .Ps(PsE), .Ss(SsE), .Se(SeE), .Sm(SmE), .InvA(InvAE), .SCnt(SCntE), .ASticky(FmaAStickyE)); 

  // divide and square root: fdiv, fsqrt, optionally integer division
  fdivsqrt fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
    .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
    .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .IntDivE, .W64E,
    .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .QeM, 
    .QmM, .FIntDivResultM);

  // compare: fmin/fmax, flt/fle/feq
  fcmp fcmp (.Fmt(FmtE), .OpCtrl(OpCtrlE), .Xs(XsE), .Ys(YsE), .Xe(XeE), .Ye(YeE), 
    .Xm(XmE), .Ym(YmE), .XZero(XZeroE), .YZero(YZeroE), .XNaN(XNaNE), .YNaN(YNaNE), 
    .XSNaN(XSNaNE), .YSNaN(YSNaNE), .X(XE), .Y(YE), .CmpNV(CmpNVE), 
    .CmpFpRes(CmpFpResE), .CmpIntRes(CmpIntResE));

  // sign injection: fsgnj/fsgnjx/fsgnjn
  fsgninj fsgninj(.OpCtrl(OpCtrlE[1:0]), .Xs(XsE), .Ys(YsE), .X(XPostBoxE), .Fmt(FmtE), .SgnRes(SgnResE));

  // classify: fclass
  fclassify fclassify (.Xs(XsE), .XSubnorm(XSubnormE), .XZero(XZeroE), .XNaN(XNaNE), 
    .XInf(XInfE), .XSNaN(XSNaNE), .ClassRes(ClassResE));

  // convert: fcvt.*.*
  fcvt fcvt (.Xs(XsE), .Xe(XeE), .Xm(XmE), .Int(ForwardedSrcAE), .OpCtrl(OpCtrlE), 
    .ToInt(FWriteIntE), .XZero(XZeroE), .Fmt(FmtE), .Ce(CeE), .ShiftAmt(CvtShiftAmtE), 
    .ResSubnormUf(CvtResSubnormUfE), .Cs(CsE), .IntZero(IntZeroE), .LzcIn(CvtLzcInE));


  // NaN Box SrcA to convert integer to requested FP size
  if(`FPSIZES == 1) assign AlignedSrcAE = {{`FLEN-`XLEN{1'b1}}, ForwardedSrcAE};
  else if(`FPSIZES == 2) 
    mux2 #(`FLEN) SrcAMux ({{`FLEN-`LEN1{1'b1}}, ForwardedSrcAE[`LEN1-1:0]}, {{`FLEN-`XLEN{1'b1}}, ForwardedSrcAE}, FmtE, AlignedSrcAE);
  else if(`FPSIZES == 3 | `FPSIZES == 4)
    mux4 #(`FLEN) SrcAMux ({{`FLEN-`S_LEN{1'b1}}, ForwardedSrcAE[`S_LEN-1:0]}, 
                            {{`FLEN-`D_LEN{1'b1}}, ForwardedSrcAE[`D_LEN-1:0]}, 
                            {{`FLEN-`H_LEN{1'b1}}, ForwardedSrcAE[`H_LEN-1:0]}, 
                            {{`FLEN-`XLEN{1'b1}}, ForwardedSrcAE}, FmtE, AlignedSrcAE); // NaN boxing zeroes

  // select a result that may be written to the FP register
  mux3  #(`FLEN) FResMux(SgnResE, AlignedSrcAE, CmpFpResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
  assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE);

  // select the result that may be written to the integer register - to IEU
  if(`FPSIZES == 1)
    assign SgnExtXE = XE;
  else if(`FPSIZES == 2) 
    mux2 #(`FLEN) sgnextmux ({{`FLEN-`LEN1{XsE}}, XE[`LEN1-1:0]}, XE, FmtE, SgnExtXE);
  else if(`FPSIZES == 3 | `FPSIZES == 4)
    mux4 #(`FLEN) fmulzeromux ({{`FLEN-`H_LEN{XsE}}, XE[`H_LEN-1:0]}, 
                                {{`FLEN-`S_LEN{XsE}}, XE[`S_LEN-1:0]}, 
                                {{`FLEN-`D_LEN{XsE}}, XE[`D_LEN-1:0]}, 
                                XE, FmtE, SgnExtXE); 

  if (`FLEN>`XLEN)
    assign IntSrcXE = SgnExtXE[`XLEN-1:0];
  else 
    assign IntSrcXE = {{`XLEN-`FLEN{XsE}}, SgnExtXE};
  mux3 #(`XLEN) IntResMux (ClassResE, IntSrcXE, CmpIntResE, {~FResSelE[1], FResSelE[0]}, FIntResE);

  // E/M pipe registers

  // Need to stall during divsqrt iterations to avoid capturing bad flags from stale forwarded sources
  assign StallUnpackedM = StallM | (FDivBusyE & ~IFDivStartE | FDivDoneE); 

  flopenrc #(`NF+1) EMFpReg2 (clk, reset, FlushM, ~StallM, XmE, XmM);
  flopenrc #(`NF+1) EMFpReg3 (clk, reset, FlushM, ~StallM, YmE, YmM);
  flopenrc #(`FLEN) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZeE,ZmE}, {ZeM,ZmM});
  flopenrc #(`XLEN) EMFpReg6 (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
  flopenrc #(`FLEN) EMFpReg7 (clk, reset, FlushM, ~StallM, PreFpResE, PreFpResM);
  flopenr #(13) EMFpReg5 (clk, reset, ~StallUnpackedM, 
    {XsE, YsE, XZeroE, YZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
    {XsM, YsM, XZeroM, YZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
  flopenrc #(1)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, PreNVE, PreNVM);      
  flopenrc #(3*`NF+4) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM);
  flopenrc #($clog2(3*`NF+5)+7+`NE) EMRegFma4(clk, reset, FlushM, ~StallM,
    {FmaAStickyE, InvAE, SCntE, AsE, PsE, SsE, SeE},
    {FmaAStickyM, InvAM, SCntM, AsM, PsM, SsM, SeM});
  flopenrc #(`NE+`LOGCVTLEN+`CVTLEN+4) EMRegCvt(clk, reset, FlushM, ~StallM, 
    {CeE, CvtShiftAmtE, CvtResSubnormUfE, CsE, IntZeroE, CvtLzcInE},
    {CeM, CvtShiftAmtM, CvtResSubnormUfM, CsM, IntZeroM, CvtLzcInM});
  flopenrc #(`FLEN) FWriteDataMReg (clk, reset, FlushM, ~StallM, YE, FWriteDataM);

  //////////////////////////////////////////////////////////////////////////////////////////
  // Memory Stage: postprocessor and result muxes
  //////////////////////////////////////////////////////////////////////////////////////////

  postprocess postprocess(.Xs(XsM), .Ys(YsM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), 
    .FmaASticky(FmaAStickyM), .XZero(XZeroM), .YZero(YZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
    .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), 
    .FmaSm(SmM), .DivQe(QeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
    .CvtCe(CeM), .CvtResSubnormUf(CvtResSubnormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), 
    .ToInt(FWriteIntM), .DivSticky(DivStickyM), .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), 
    .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));

  // FPU flag selection - to privileged
  mux2  #(5)      FPUFlgMux({PreNVM&~FResSelM[1], 4'b0}, PostProcFlgM, ~FResSelM[1]&FResSelM[0], SetFflagsM);
  mux2  #(`FLEN)  FPUResMux(PreFpResM, PostProcResM, FResSelM[0], FpResM);

  // M/W pipe registers
  flopenrc #(`FLEN) MWRegFp(clk, reset, FlushW, ~StallW, FpResM, FpResW); 
  flopenrc #(`XLEN) MWRegIntCvtRes(clk, reset, FlushW, ~StallW, FCvtIntResM, FCvtIntResW); 
  flopenrc #(`XLEN) MWRegIntDivRes(clk, reset, FlushW, ~StallW, FIntDivResultM, FIntDivResultW); 

  //////////////////////////////////////////////////////////////////////////////////////////
  // Writeback Stage: result mux
  //////////////////////////////////////////////////////////////////////////////////////////

  // select the result to be written to the FP register
  mux2  #(`FLEN)  FResultMux (FpResW, ReadDataW, FResSelW[1], FResultW);

endmodule // fpu