///////////////////////////////////////////
// round.sv
//
// Written: me@KatherineParry.com
// Modified: 7/5/2022
//
// Purpose: Rounder
// 
// Documentation: RISC-V System on Chip Design Chapter 13
//
// A component of the CORE-V-WALLY configurable RISC-V project.
// 
// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
//
// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
//
// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
// may obtain a copy of the License at
//
// https://solderpad.org/licenses/SHL-2.1/
//
// Unless required by applicable law or agreed to in writing, any work distributed under the 
// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
// either express or implied. See the License for the specific language governing permissions 
// and limitations under the License.
////////////////////////////////////////////////////////////////////////////////////////////////

module round import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.FMTBITS-1:0]     OutFmt,             // output format
  input  logic [2:0]              Frm,                // rounding mode
  input  logic [1:0]              PostProcSel,        // select the postprocessor output
  input  logic                    Ms,                 // normalized sign
  input  logic [P.CORRSHIFTSZ-1:0] Mf,                 // normalized fraction
  // fma
  input  logic                    FmaOp,              // is an fma opperation being done?
  input  logic [P.NE+1:0]          FmaMe,              // exponent of the normalized sum for fma
  input  logic                    FmaASticky,         // addend's sticky bit
  // divsqrt
  input  logic                    DivOp,              // is a division opperation being done
  input  logic                    DivSticky,          // divsqrt sticky bit
  input  logic [P.NE+1:0]          Qe,                 // the divsqrt calculated expoent
  // cvt
  input  logic                    CvtOp,              // is a convert opperation being done
  input  logic                    ToInt,              // is the cvt op a cvt to integer
  input  logic                    CvtResSubnormUf,    // is the cvt result subnormal or underflow
  input  logic                    CvtResUf,           // does the cvt result underflow
  input  logic [P.NE:0]            CvtCe,              // the cvt calculated expoent
  // outputs
  output logic [P.NE+1:0]          Me,                 // normalied fraction
  output logic                    UfPlus1,            // do you add one to the result if given an unbounded exponent
  output logic [P.NE+1:0]          FullRe,             // Re with bits to determine sign and overflow
  output logic [P.NE-1:0]          Re,                 // Result exponent
  output logic [P.NF-1:0]          Rf,                 // Result fractionNormS
  output logic                    Sticky,             // sticky bit
  output logic                    Plus1,              // do you add one to the final result
  output logic                    Round, Guard        // bits needed to calculate rounding
);

  logic           UfCalcPlus1;        // calculated plus one for unbounded exponent
  logic           NormSticky;         // normalized sum's sticky bit
  logic [P.NF-1:0] RoundFrac;          // rounded fraction
  logic           FpRes;              // is the result a floating point
  logic           IntRes;             // is the result an integer
  logic           FpGuard, FpRound;   // floating point round/guard bits
  logic           FpLsbRes;           // least significant bit of floating point result
  logic           LsbRes;             // lsb of result
  logic           CalcPlus1;          // calculated plus1
  logic           FpPlus1;            // do you add one to the fp result 
  logic [P.FLEN:0] RoundAdd;           // how much to add to the result

// what position is XLEN in?
//  options: 
//     1: XLEN > NF   > NF1
//     2: NF   > XLEN > NF1
//     3: NF   > NF1  > XLEN
//  single and double will always be smaller than XLEN
//`define XLENPOS ((`XLEN>`NF) ? 1 : (`XLEN>`NF1) ? 2 : 3)
  localparam XLENPOS = P.XLEN > P.NF ? 1 : P.XLEN > P.NF1 ? 2 : 3;
  
  ///////////////////////////////////////////////////////////////////////////////
  // Rounding
  ///////////////////////////////////////////////////////////////////////////////

  // round to nearest even
  //      {Round, Sticky}
  //      0x - do nothing
  //      10 - tie - Plus1 if result is odd  (LSBNormSum = 1)
  //          - don't add 1 if a small number was supposed to be subtracted
  //      11 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
  //         - plus 1 otherwise

  //  round to zero - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0

  //  round to -infinity
  //          - Plus1 if negative unless a small number was supposed to be subtracted from a result with guard and round bits of 0
  //          - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0

  //  round to infinity
  //          - Plus1 if positive unless a small number was supposed to be subtracted from a result with guard and round bits of 0
  //          - subtract 1 if a small number was supposed to be subtracted from a negative result with guard and round bits of 0

  //  round to nearest max magnitude
  //      {Guard, Round, Sticky}
  //      0x - do nothing
  //      10 - tie - Plus1
  //          - don't add 1 if a small number was supposed to be subtracted
  //      11 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
  //         - Plus 1 otherwise


  // determine what format the final result is in: int or fp
  assign IntRes = ToInt;
  assign FpRes = ~IntRes;

  // sticky bit calculation
  if (P.FPSIZES == 1) begin

      //     1: XLEN > NF
      //      |         XLEN          |
      //      |    NF     |1|1|
      //                     ^    ^ if floating point result
      //                     ^ if not an FMA result
      if (XLENPOS == 1)assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes) |
                                                (|Mf[P.CORRSHIFTSZ-P.XLEN-2:0]);
      //     2: NF > XLEN
      if (XLENPOS == 2)assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF-1]&IntRes) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF-2:0]);

  end else if (P.FPSIZES == 2) begin
      // XLEN is either 64 or 32
      // so half and single are always smaller then XLEN

      // 1: XLEN > NF   > NF1
      if (XLENPOS == 1) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.NF-1]&FpRes&~OutFmt) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes) |
                                                (|Mf[P.CORRSHIFTSZ-P.XLEN-2:0]);
      // 2: NF   > XLEN > NF1
      if (XLENPOS == 2) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&~OutFmt) | 
                                                (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF-1]&(IntRes|~OutFmt)) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF-2:0]);
      // 3: NF   > NF1  > XLEN
      if (XLENPOS == 3) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF1-1]&IntRes) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.NF-1]&(~OutFmt|IntRes)) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF-2:0]);

  end else if (P.FPSIZES == 3) begin
      // 1: XLEN > NF   > NF1
      if (XLENPOS == 1) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF2-2:P.CORRSHIFTSZ-P.NF1-1]&FpRes&(OutFmt==P.FMT1)) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.NF-1]&FpRes&~(OutFmt==P.FMT)) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes) |
                                                (|Mf[P.CORRSHIFTSZ-P.XLEN-2:0]);
      // 2: NF   > XLEN > NF1
      if (XLENPOS == 2) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF2-2:P.CORRSHIFTSZ-P.NF1-1]&FpRes&(OutFmt==P.FMT1)) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&~(OutFmt==P.FMT)) | 
                                                (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF-1]&(IntRes|~(OutFmt==P.FMT))) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF-2:0]);
      // 3: NF   > NF1  > XLEN
      if (XLENPOS == 3) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF2-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&(OutFmt==P.FMT1)) |
                                                (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF1-1]&((OutFmt==P.FMT1)|IntRes)) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.NF-1]&(~(OutFmt==P.FMT)|IntRes)) |
                                                (|Mf[P.CORRSHIFTSZ-P.NF-2:0]);

  end else if (P.FPSIZES == 4) begin
      // Quad precision will always be greater than XLEN
      // 2: NF   > XLEN > NF1
      if (XLENPOS == 2) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.H_NF-2:P.CORRSHIFTSZ-P.S_NF-1]&FpRes&(OutFmt==P.H_FMT)) |
                                                (|Mf[P.CORRSHIFTSZ-P.S_NF-2:P.CORRSHIFTSZ-P.D_NF-1]&FpRes&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT))) | 
                                                (|Mf[P.CORRSHIFTSZ-P.D_NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&~(OutFmt==P.Q_FMT)) | 
                                                (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.Q_NF-1]&(~(OutFmt==P.Q_FMT)|IntRes)) |
                                                (|Mf[P.CORRSHIFTSZ-P.Q_NF-2:0]);
      // 3: NF   > NF1  > XLEN
      // The extra XLEN bit will be ored later when caculating the final sticky bit - the ufplus1 not needed for integer
      if (XLENPOS == 3) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.H_NF-2:P.CORRSHIFTSZ-P.S_NF-1]&FpRes&(OutFmt==P.H_FMT)) |
                                                (|Mf[P.CORRSHIFTSZ-P.S_NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT))) |
                                                (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.D_NF-1]&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT)|IntRes)) |
                                                (|Mf[P.CORRSHIFTSZ-P.D_NF-2:P.CORRSHIFTSZ-P.Q_NF-1]&(~(OutFmt==P.Q_FMT)|IntRes)) |
                                                (|Mf[P.CORRSHIFTSZ-P.Q_NF-2:0]);

  end
  

  // only add the Addend sticky if doing an FMA opperation
  //      - the shifter shifts too far left when there's an underflow (shifting out all possible sticky bits)
  assign Sticky = FmaASticky&FmaOp | NormSticky | CvtResUf&CvtOp | FmaMe[P.NE+1]&FmaOp | DivSticky&DivOp;
  

  // determine round and LSB of the rounded value
  //      - underflow round bit is used to determint the underflow flag
  if (P.FPSIZES == 1) begin
      assign FpGuard = Mf[P.CORRSHIFTSZ-P.NF-1];
      assign FpLsbRes = Mf[P.CORRSHIFTSZ-P.NF];
      assign FpRound = Mf[P.CORRSHIFTSZ-P.NF-2];

  end else if (P.FPSIZES == 2) begin
      assign FpGuard = OutFmt ? Mf[P.CORRSHIFTSZ-P.NF-1] : Mf[P.CORRSHIFTSZ-P.NF1-1];
      assign FpLsbRes = OutFmt ? Mf[P.CORRSHIFTSZ-P.NF] : Mf[P.CORRSHIFTSZ-P.NF1];
      assign FpRound = OutFmt ? Mf[P.CORRSHIFTSZ-P.NF-2] : Mf[P.CORRSHIFTSZ-P.NF1-2];

  end else if (P.FPSIZES == 3) begin
      always_comb
          case (OutFmt)
              P.FMT: begin
                  FpGuard = Mf[P.CORRSHIFTSZ-P.NF-1];
                  FpLsbRes = Mf[P.CORRSHIFTSZ-P.NF];
                  FpRound = Mf[P.CORRSHIFTSZ-P.NF-2];
              end
              P.FMT1: begin
                  FpGuard = Mf[P.CORRSHIFTSZ-P.NF1-1];
                  FpLsbRes = Mf[P.CORRSHIFTSZ-P.NF1];
                  FpRound = Mf[P.CORRSHIFTSZ-P.NF1-2];
              end
              P.FMT2: begin
                  FpGuard = Mf[P.CORRSHIFTSZ-P.NF2-1];
                  FpLsbRes = Mf[P.CORRSHIFTSZ-P.NF2];
                  FpRound = Mf[P.CORRSHIFTSZ-P.NF2-2];
              end
              default: begin
                  FpGuard = 1'bx;
                  FpLsbRes = 1'bx;
                  FpRound = 1'bx;
              end
          endcase
  end else if (P.FPSIZES == 4) begin
      always_comb
          case (OutFmt)
              2'h3: begin
                  FpGuard = Mf[P.CORRSHIFTSZ-P.Q_NF-1];
                  FpLsbRes = Mf[P.CORRSHIFTSZ-P.Q_NF];
                  FpRound = Mf[P.CORRSHIFTSZ-P.Q_NF-2];
              end
              2'h1: begin
                  FpGuard = Mf[P.CORRSHIFTSZ-P.D_NF-1];
                  FpLsbRes = Mf[P.CORRSHIFTSZ-P.D_NF];
                  FpRound = Mf[P.CORRSHIFTSZ-P.D_NF-2];
              end
              2'h0: begin
                  FpGuard = Mf[P.CORRSHIFTSZ-P.S_NF-1];
                  FpLsbRes = Mf[P.CORRSHIFTSZ-P.S_NF];
                  FpRound = Mf[P.CORRSHIFTSZ-P.S_NF-2];
              end
              2'h2: begin
                  FpGuard = Mf[P.CORRSHIFTSZ-P.H_NF-1];
                  FpLsbRes = Mf[P.CORRSHIFTSZ-P.H_NF];
                  FpRound = Mf[P.CORRSHIFTSZ-P.H_NF-2];
              end
          endcase
  end

  assign Guard = ToInt&CvtOp ? Mf[P.CORRSHIFTSZ-P.XLEN-1] : FpGuard;
  assign LsbRes = ToInt&CvtOp ? Mf[P.CORRSHIFTSZ-P.XLEN] : FpLsbRes;
  assign Round = ToInt&CvtOp ? Mf[P.CORRSHIFTSZ-P.XLEN-2] : FpRound;


  always_comb begin
      // Determine if you add 1
      case (Frm)
          3'b000: CalcPlus1 = Guard & (Round|Sticky|LsbRes);//round to nearest even
          3'b001: CalcPlus1 = 0;//round to zero
          3'b010: CalcPlus1 = Ms;//round down
          3'b011: CalcPlus1 = ~Ms;//round up
          3'b100: CalcPlus1 = Guard;//round to nearest max magnitude
          default: CalcPlus1 = 1'bx;
      endcase
      // Determine if you add 1 (for underflow flag)
      case (Frm)
          3'b000: UfCalcPlus1 = Round & (Sticky|Guard);//round to nearest even
          3'b001: UfCalcPlus1 = 0;//round to zero
          3'b010: UfCalcPlus1 = Ms;//round down
          3'b011: UfCalcPlus1 = ~Ms;//round up
          3'b100: UfCalcPlus1 = Round;//round to nearest max magnitude
          default: UfCalcPlus1 = 1'bx;
      endcase
  
  end

  // If an answer is exact don't round
  assign Plus1 = CalcPlus1 & (Sticky|Round|Guard);
  assign FpPlus1 = Plus1&~(ToInt&CvtOp);
  assign UfPlus1 = UfCalcPlus1 & (Sticky|Round);


  // place Plus1 into the proper position for the format
  if (P.FPSIZES == 1) begin
      assign RoundAdd = {{P.FLEN{1'b0}}, FpPlus1};

  end else if (P.FPSIZES == 2) begin
      // \/FLEN+1
      //  | NE+2 |        NF      |
      //  '-NE+2-^----NF1----^
      // P.FLEN+1-P.NE-2-P.NF1 = FLEN-1-NE-NF1
      assign RoundAdd = {(P.NE+1+P.NF1)'(0), FpPlus1&~OutFmt, (P.NF-P.NF1-1)'(0), FpPlus1&OutFmt};

  end else if (P.FPSIZES == 3) begin
      assign RoundAdd = {(P.NE+1+P.NF2)'(0), FpPlus1&(OutFmt==P.FMT2), (P.NF1-P.NF2-1)'(0), FpPlus1&(OutFmt==P.FMT1), (P.NF-P.NF1-1)'(0), FpPlus1&(OutFmt==P.FMT)};

  end else if (P.FPSIZES == 4)      
      assign RoundAdd = {(P.Q_NE+1+P.H_NF)'(0), FpPlus1&(OutFmt==P.H_FMT), (P.S_NF-P.H_NF-1)'(0), FpPlus1&(OutFmt==P.S_FMT), (P.D_NF-P.S_NF-1)'(0), FpPlus1&(OutFmt==P.D_FMT), (P.Q_NF-P.D_NF-1)'(0), FpPlus1&(OutFmt==P.Q_FMT)};


  // trim unneeded bits from fraction
  assign RoundFrac = Mf[P.CORRSHIFTSZ-1:P.CORRSHIFTSZ-P.NF];
  

  // select the exponent
  always_comb
      case(PostProcSel)
          2'b10: Me = FmaMe; // fma
          2'b00: Me = {CvtCe[P.NE], CvtCe}&{P.NE+2{~CvtResSubnormUf|CvtResUf}}; // cvt
          // 2'b01: Me = DivDone ? Qe : '0; // divide
          2'b01: Me = Qe; // divide
          default: Me = '0; 
      endcase


  // round the result
  //      - if the fraction overflows one should be added to the exponent
  assign {FullRe, Rf} = {Me, RoundFrac} + RoundAdd;
  assign Re = FullRe[P.NE-1:0];


endmodule