///////////////////////////////////////////
//
// Written: Katherine Parry, James Stine, Brett Mathis
// Modified: 6/23/2021
//
// Purpose: FPU
// 
// A component of the Wally configurable RISC-V project.
// 
// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
///////////////////////////////////////////

`include "wally-config.vh"

module fpu (
  input logic 		   clk,
  input logic 		   reset,
  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
  input logic [31:0] 	   InstrD, // instruction from IFU
  input logic [`XLEN-1:0]  ReadDataW,// Read data from memory
  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed (from IEU)
  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg (from IEU)
  input logic 		   StallE, StallM, StallW, // stall signals from HZU
  input logic 		   FlushE, FlushM, FlushW, // flush signals from HZU
  input logic [4:0] 	   RdE, RdM, RdW, // which FP register to write to (from IEU)
  output logic 		   FRegWriteM, // FP register write enable
  output logic 		   FStallD, // Stall the decode stage
  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW, // integer register write enable
  output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
  output logic [`XLEN-1:0] FIntResM, // data to be written to integer register
  output logic 		   FDivBusyE, // Is the divide/sqrt unit busy (stall execute stage)
  output logic 		   IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
  output logic [4:0] 	   SetFflagsM        // FMA flags (to privileged unit)
  );

  //*** make everything FLEN at some point
  //*** add the 128 bit support to the if statement when needed
  //*** make new tests for fp using testfloat that include flag checking and all rounding modes
  //*** what is the format for 16-bit - finding conflicting info online can't find anything specified in spec
  //*** only fma/mul and fp <-> int convert flags have been tested. test the others.

  // FPU specifics:
  //    - uses NaN-blocking format
  //        - if there are any unsused bits the most significant bits are filled with 1s
  //                single stored in a double: | 32 1s | single precision value |
  //    - sets the underflow after rounding
  
  generate if (`F_SUPPORTED | `D_SUPPORTED) begin : fpu

     // control signals
     logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
     logic [2:0] 	  FrmD, FrmE, FrmM;                   // FP rounding mode
     logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
     logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
     logic 		  FWriteIntD;                         // Write to integer register
     logic [1:0] 	  FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
     logic [1:0] 	  FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
     logic [2:0] 	  FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which opperation to do in each component
     logic [2:0] 	  FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
     logic [1:0] 	  FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer resister
     logic [4:0] 	  Adr1E, Adr2E, Adr3E;                    // adresses of each input
     
     // regfile signals
     logic [63:0] 	  FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
     logic [63:0] 	  FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
     logic [63:0] 	  FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
     logic [63:0] 	  FPreSrcYE, FSrcYE;               // Input 2 to the various units (after forwarding)
     logic [63:0] 	  FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
     
     // unpacking signals
     logic 		  XSgnE, YSgnE, ZSgnE;     // input's sign - execute stage
     logic 		  XSgnM, YSgnM;     // input's sign - memory stage
     logic [10:0] 	  XExpE, YExpE, ZExpE;     // input's exponent - execute stage
     logic [10:0] 	  XExpM, YExpM, ZExpM;     // input's exponent - memory stage
     logic [52:0] 	  XManE, YManE, ZManE;  // input's fraction - execute stage
     logic [52:0] 	  XManM, YManM, ZManM;  // input's fraction - memory stage
     logic [10:0] 	  BiasE;                   // bias based on precision (single=7f double=3ff - max expoent/2)
     logic 		  XNaNE, YNaNE, ZNaNE;           // is the input a NaN - execute stage
     logic 		  XNaNM, YNaNM, ZNaNM;           // is the input a NaN - memory stage
     logic       XNaNQ, YNaNQ;                  // is the input a NaN - divide
     logic 		  XSNaNE, YSNaNE, ZSNaNE;        // is the input a signaling NaN - execute stage
     logic 		  XSNaNM, YSNaNM, ZSNaNM;        // is the input a signaling NaN - memory stage
     logic 		  XDenormE, YDenormE, ZDenormE;  // is the input denormalized
     logic 		  XZeroE, YZeroE, ZZeroE;        // is the input zero - execute stage
     logic 		  XZeroM, YZeroM, ZZeroM;        // is the input zero - memory stage
     logic       XZeroQ, YZeroQ;                // is the input zero - divide
     logic 		  XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
     logic 		  XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
     logic       XInfQ, YInfQ;                  // is the input infinity - divide
     logic 		  XExpMaxE;                      // is the exponent all ones (max value)
     logic 		  XNormE;                 // is normal     
     
     // result and flag signals
     logic [63:0] 	  FDivResM, FDivResW; // divide/squareroot result
     logic [4:0] 	  FDivFlgM, FDivFlgW; // divide/squareroot flags  
     logic [63:0] 	  FMAResM, FMAResW;   // FMA/multiply result
     logic [4:0] 	  FMAFlgM, FMAFlgW;   // FMA/multiply result	
     logic [63:0] 	  ReadResW;           // read result (load instruction)
     logic [63:0] 	  CvtFpResE, CvtFpResM, CvtFpResW; // add/FP -> FP convert result
     logic [4:0] 	  CvtFpFlgE, CvtFpFlgM, CvtFpFlgW; // add/FP -> FP convert flags
     logic [63:0] 	  CvtResE, CvtResM;   // FP <-> int convert result
     logic [4:0] 	  CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this	
     logic [63:0] 	  ClassResE, ClassResM; // classify result
     logic [63:0] 	  CmpResE, CmpResM; // compare result
     logic 		  CmpNVE, CmpNVM;   // compare invalid flag (Not Valid)     
     logic [63:0] 	  SgnResE, SgnResM; // sign injection result
     logic 		  SgnNVE, SgnNVM;   // sign injection invalid flag (Not Valid)     
     logic [63:0] 	  FResE, FResM, FResW;     // selected result that is ready in the memory stage
     logic [4:0] 	  FFlgE, FFlgM;            // selected flag that is ready in the memory stage     
     logic [`XLEN-1:0] 	  FIntResE;     
     logic [63:0] 	  FPUResultW;    // final FP result being written to the FP register
     
     // other signals
     logic 		  FDivSqrtDoneE;          // is divide done
     logic [63:0] 	  DivInput1E, DivInput2E; // inputs to divide/squareroot unit
     logic 		  FDivClk;                // clock for divide/squareroot unit
     logic [63:0] 	  AlignedSrcAE;           // align SrcA to the floating point format

     // DECODE STAGE
     // calculate FP control signals
     fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
		  .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
		  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
	
     // FP register file
     //    - can read 3 registers and write 1 register every cycle
     fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
			.a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
			.wd4(FPUResultW),
			.rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	

     // D/E pipeline registers
     flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
     flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
     flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
     flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
                             {Adr1E, Adr2E, Adr3E});
     flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
			       {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
			       {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});

     // EXECUTION STAGE
     // Hazard unit for FPU  
     //    - determines if any forwarding or stalls are needed
     fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
                     .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
     
     // forwarding muxs
     mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
     mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
     mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
     mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
			   {2'b0, {10{1'b1}}, 52'b0}, 
			   {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, 
			   FSrcYE); // Force Z to be 0 for multiply instructions
     // Force Z to be 0 for multiply instructions     
     mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
       
     // unpacking unit
     //    - splits FP inputs into their various parts
     //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity)
     unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
			 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
			 .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
			 .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
     
     // FMA
     //   - two stage FMA
     //   - execute stage - multiplication and addend shifting
     //   - memory stage  - addition and rounding
     //   - handles FMA and multiply instructions
     fma fma (.clk, .reset, .FlushM, .StallM, 
	      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
	      .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE,
	      .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
	      .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
	      .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
	      .FOpCtrlE,
	      .FmtE, .FmtM, .FrmM, 
	      .FMAFlgM, .FMAResM);
     
     // capture the inputs for divide/sqrt
     floprc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
			      .clear(FDivSqrtDoneE),
			      .reset(reset),  .clk(FDivBusyE));
     floprc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
			      .clear(FDivSqrtDoneE),
			      .reset(reset),  .clk(FDivBusyE));
     floprc #(6) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE}), 
			     .q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ}),
			     .clear(FDivSqrtDoneE),
			     .reset(reset),  .clk(FDivBusyE));            
     // fpdivsqrt using Goldschmidt's iteration
     fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
		      .reset, .clk(clk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
		      .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ,
		      .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));

     // convert from signle to double and vice versa
     cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
     
     // compare unit
     //    - computation is done in one stage
     //    - writes to FP file durring min/max instructions
     //    - other comparisons write a 1 or 0 to the integer register
     fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
		.FSrcXE, .FSrcYE, .FOpCtrlE, 
		.FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
		.Invalid(CmpNVE), .CmpResE);
     
     // sign injection unit
     //    - computation is done in one stage
     fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
		.SgnNVE, .SgnResE);
     
     // classify
     //    - computation is done in one stage
     //    - most of the work is done in the unpacking unit
     //    - result is written to the integer register
     fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
			  .XSNaNE, .ClassResE);
     
     fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
		.CvtResE, .CvtFlgE);
     
     // data to be stored in memory - to IEU
     //    - FP uses NaN-blocking format
     //        - if there are any unsused bits the most significant bits are filled with 1s
     assign FWriteDataE = FSrcYE[`XLEN-1:0];     
     
     // Align SrcA to MSB when single precicion
     mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAE[31:0]}, {{64-`XLEN{1'b1}}, SrcAE}, FmtE, AlignedSrcAE);
     
     // select a result that may be written to the FP register
     mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
     mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
     
     // select the result that may be written to the integer register - to IEU
     mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], 
			       CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
     
     // E/M pipe registers

     // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
     flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
     flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
     flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
     flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
			     {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
			     {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
     flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); 
     flopenrc #(5)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
     flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
     flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM,
			      {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
			      {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
     
     // BEGIN MEMORY STAGE
     // FPU flag selection - to privileged
     mux4  #(5)  FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
  
     // M/W pipe registers
     flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
     flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
     flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, CvtFpResM, CvtFpResW); 
     flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
     flopenrc #(5)  MWCtrlReg(clk, reset, FlushW, ~StallW,
			      {FRegWriteM, FResultSelM, FmtM, FWriteIntM},
			      {FRegWriteW, FResultSelW, FmtW, FWriteIntW});
     
     // BEGIN WRITEBACK STAGE
     
     // put ReadData into NaN-blocking format
     //    - if there are any unsused bits the most significant bits are filled with 1s
     //    - for load instruction
     mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
     
     // select the result to be written to the FP register
     mux4  #(64)  FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);

  end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low
     assign FStallD = 0;
     assign FWriteIntE = 0; 
     assign FWriteIntM = 0;
     assign FWriteIntW = 0;
     assign FWriteDataE = 0;
     assign FIntResM = 0;
     assign FDivBusyE = 0;
     assign IllegalFPUInstrD = 1;
     assign SetFflagsM = 0;
  end
  endgenerate 
   
endmodule // fpu