began divremsqrt specific postprocessing

2025-02-11 06:05:49 +00:00 · 2023-05-19 14:20:22 -07:00 · 2023-05-19 14:20:22 -07:00 · e17cfe9622
commit e17cfe9622
parent c9ceda5794
5 changed files with 813 additions and 0 deletions
--- a/src/fpu/divremsqrt/divremsqrt.sv
+++ b/src/fpu/divremsqrt/divremsqrt.sv
@ -0,0 +1,104 @@
 ///////////////////////////////////////////
 // divremsqrt.sv
 //
 // Written: kekim@hmc.edu
 // Modified:19 May 2023
 //
 // Purpose: Combined Divide and Square Root Floating Point and Integer Unit with postprocessing
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
 //
 // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
 //
 // Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
 // https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 // either express or implied. See the License for the specific language governing permissions 
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 `include "wally-config.vh"
 module fdivsqrt(
  // clocking / pipeline control
  input  logic                clk, 
  input  logic                reset, 
  // floating-point operands (unpacked) and their classification
  input  logic [`FMTBITS-1:0] FmtE,
  input  logic                XsE,
  input  logic [`NF:0]        XmE, YmE,
  input  logic [`NE-1:0]      XeE, YeE,
  input  logic                XInfE, YInfE, 
  input  logic                XZeroE, YZeroE, 
  input  logic                XNaNE, YNaNE, 
  // start / stall / flush handshakes
  input  logic                FDivStartE, IDivStartE,
  input  logic                StallM,
  input  logic                FlushE,
  input  logic                SqrtE, SqrtM,
  // integer operands: the src values before the mux that chooses between them and PCE for srcA/B
  input  logic [`XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE,
  input  logic [2:0]          Funct3E, Funct3M,
  input  logic                IntDivE, W64E,
  // results and status
  output logic                DivStickyM,
  output logic                FDivBusyE, IFDivStartE, FDivDoneE,
  output logic [`NE+1:0]      QeM,
  output logic [`DIVb:0]      QmM,
  output logic [`XLEN-1:0]    FIntDivResultM
 );

  // Floating-point division and square root, with optional integer division and remainder.
  // Computes X/Y, sqrt(X), A/B, or A%B by chaining four stages:
  //   preprocessor -> FSM (control) -> CSA iterator -> postprocessor

  // ---- iterator datapath ----
  logic [`DIVb+3:0]           X;                            // Iterator initial value (from dividend)
  logic [`DIVb+3:0]           D;                            // Iterator divisor
  logic [`DIVb+3:0]           WS, WC;                       // Partial remainder components
  logic [`DIVb:0]             FirstU, FirstUM;              // Intermediate result values
  logic [`DIVb+1:0]           FirstC;                       // Step tracker
  logic                       Firstun;                      // Quotient selection

  // ---- control ----
  logic [`DURLEN-1:0]         CyclesE;                      // FSM cycles
  logic                       WZeroE;                       // Early termination flag
  logic                       SpecialCaseM;                 // Divide by zero, square root of negative, etc.
  logic                       DivStartE;                    // Enable signal for flops during stall (not referenced in this module)

  // ---- integer div/rem ----
  logic                       ISpecialCaseE;                // Integer div/remainder special cases
  logic                       IntDivM;                      // Integer operation
  logic                       BZeroM;                       // Denominator is zero
  logic [`DIVBLEN:0]          nM, mM;                       // Shift amounts
  logic [`XLEN-1:0]           AM;                           // Original numerator for postprocessor
  logic                       NegQuotM, ALTBM, AsM, W64M;   // Special handling for postprocessor

  // Preprocessor: produces X, D, the exponent QeM and the cycle count from the operands
  fdivsqrtpreproc fdivsqrtpreproc(
    .clk(clk),
    .IFDivStartE(IFDivStartE),
    .Xm(XmE),
    .Ym(YmE),
    .Xe(XeE),
    .Ye(YeE),
    .FmtE(FmtE),
    .SqrtE(SqrtE),
    .XZeroE(XZeroE),
    .Funct3E(Funct3E),
    .QeM(QeM),
    .X(X),
    .D(D),
    .CyclesE(CyclesE),
    // Int-specific
    .ForwardedSrcAE(ForwardedSrcAE),
    .ForwardedSrcBE(ForwardedSrcBE),
    .IntDivE(IntDivE),
    .W64E(W64E),
    .ISpecialCaseE(ISpecialCaseE),
    .BZeroM(BZeroM),
    .nM(nM),
    .mM(mM),
    .AM(AM),
    .IntDivM(IntDivM),
    .W64M(W64M),
    .NegQuotM(NegQuotM),
    .ALTBM(ALTBM),
    .AsM(AsM)
  );

  // FSM: generates the busy/start/done handshakes and the special-case flag
  fdivsqrtfsm fdivsqrtfsm(
    .clk(clk),
    .reset(reset),
    .XInfE(XInfE),
    .YInfE(YInfE),
    .XZeroE(XZeroE),
    .YZeroE(YZeroE),
    .XNaNE(XNaNE),
    .YNaNE(YNaNE),
    .FDivStartE(FDivStartE),
    .XsE(XsE),
    .SqrtE(SqrtE),
    .WZeroE(WZeroE),
    .FlushE(FlushE),
    .StallM(StallM),
    .FDivBusyE(FDivBusyE),
    .IFDivStartE(IFDivStartE),
    .FDivDoneE(FDivDoneE),
    .SpecialCaseM(SpecialCaseM),
    .CyclesE(CyclesE),
    // Int-specific
    .IDivStartE(IDivStartE),
    .ISpecialCaseE(ISpecialCaseE),
    .IntDivE(IntDivE)
  );

  // CSA iterator: its FirstWS/FirstWC outputs are the partial remainder WS/WC
  fdivsqrtiter fdivsqrtiter(
    .clk(clk),
    .IFDivStartE(IFDivStartE),
    .FDivBusyE(FDivBusyE),
    .SqrtE(SqrtE),
    .X(X),
    .D(D),
    .FirstU(FirstU),
    .FirstUM(FirstUM),
    .FirstC(FirstC),
    .Firstun(Firstun),
    .FirstWS(WS),
    .FirstWC(WC)
  );

  // Postprocessor: produces the significand QmM, the sticky bit and the integer result
  fdivsqrtpostproc fdivsqrtpostproc(
    .clk(clk),
    .reset(reset),
    .StallM(StallM),
    .WS(WS),
    .WC(WC),
    .D(D),
    .FirstU(FirstU),
    .FirstUM(FirstUM),
    .FirstC(FirstC),
    .SqrtE(SqrtE),
    .Firstun(Firstun),
    .SqrtM(SqrtM),
    .SpecialCaseM(SpecialCaseM),
    .QmM(QmM),
    .WZeroE(WZeroE),
    .DivStickyM(DivStickyM),
    // Int-specific
    .nM(nM),
    .mM(mM),
    .ALTBM(ALTBM),
    .AsM(AsM),
    .BZeroM(BZeroM),
    .NegQuotM(NegQuotM),
    .W64M(W64M),
    .RemOpM(Funct3M[1]),   // bit 1 of funct3 is used as the remainder-operation select
    .AM(AM),
    .FIntDivResultM(FIntDivResultM)
  );
 endmodule
--- a/src/fpu/divremsqrt/divremsqrtpostprocess.sv
+++ b/src/fpu/divremsqrt/divremsqrtpostprocess.sv
@ -0,0 +1,231 @@
 ///////////////////////////////////////////
 // divremsqrtpostprocess.sv
 //
 // Written: kekim@hmc.edu
 // Modified: 19 May 2023
 //
 // Purpose: Post-Processing: normalization, rounding, sign, flags, special cases
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
 //
 // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
 //
 // Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
 // https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 // either express or implied. See the License for the specific language governing permissions 
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 `include "wally-config.vh"
 module divremsqrtpostprocess (
  // general signals
  input logic                             Xs, Ys,     // input signs
  input logic  [`NF:0]                    Xm, Ym, Zm, // input mantissas
  input logic  [2:0]                      Frm,        // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
  input logic  [`FMTBITS-1:0]             Fmt,        // precision 1 = double 0 = single
  input logic  [2:0]                      OpCtrl,     // choose which operation (only bit 0, the sqrt select, is read here)
  input logic                             XZero, YZero,        // inputs are zero
  input logic                             XInf, YInf, ZInf,    // inputs are infinity
  input logic                             XNaN, YNaN, ZNaN,    // inputs are NaN
  input logic                             XSNaN, YSNaN, ZSNaN, // inputs are signaling NaNs
  input logic  [1:0]                      PostProcSel,         // select result to be written to fp register
  // fma signals (kept for interface compatibility; not read by this divsqrt-only datapath except FmaAs/FmaPs which go to flags)
  input logic                             FmaAs,      // the modified Z sign - depends on instruction
  input logic                             FmaPs,      // the product's sign
  input logic                             FmaSs,      // Sum sign
  input logic  [`NE+1:0]                  FmaSe,      // the sum's exponent
  input logic  [3*`NF+3:0]                FmaSm,      // the positive sum
  input logic                             FmaASticky, // sticky bit that is calculated during alignment
  input logic  [$clog2(3*`NF+5)-1:0]      FmaSCnt,    // the normalization shift count
  // divide signals
  input logic                             DivSticky,  // divider sticky bit
  input logic  [`NE+1:0]                  DivQe,      // divsqrt exponent
  input logic  [`DIVb:0]                  DivQm,      // divsqrt significand
  // conversion signals (kept for interface compatibility)
  input logic                             CvtCs,      // the result's sign
  input logic  [`NE:0]                    CvtCe,      // the calculated exponent
  input logic                             CvtResSubnormUf, // the convert result is subnormal or underflows
  input logic  [`LOGCVTLEN-1:0]           CvtShiftAmt,// how much to shift by
  input logic                             ToInt,      // is fp->int (since it's writing to the integer register)
  input logic  [`CVTLEN-1:0]              CvtLzcIn,   // input to the Leading Zero Counter (without msb)
  input logic                             IntZero,    // is the integer input zero
  // final results
  output logic [`FLEN-1:0]                PostProcRes,// postprocessor final result
  output logic [4:0]                      PostProcFlg,// postprocessor flags
  output logic [`XLEN-1:0]                FCvtIntRes  // the integer conversion result
  );

  // Divsqrt-specific postprocessor: normalization shift, shift correction, rounding,
  // flags and special-case selection. The fma and cvt paths of the generic
  // postprocessor are disabled in this module: the normalization shifter is fed only
  // from divshiftcalc, and the fma/cvt select signals are tied off below.

  // general signals
  logic                       Rs;         // result sign
  logic [`NF-1:0]             Rf;         // result fraction
  logic [`NE-1:0]             Re;         // result exponent
  logic                       Ms;         // normalized sign
  logic [`CORRSHIFTSZ-1:0]    Mf;         // normalized fraction
  logic [`NE+1:0]             Me;         // normalized exponent
  logic [`NE+1:0]             FullRe;     // Re with bits to determine sign and overflow
  logic                       UfPlus1;    // do you add one (for determining underflow flag)
  logic [`LOGNORMSHIFTSZ-1:0] ShiftAmt;   // normalization shift amount
  logic [`NORMSHIFTSZ-1:0]    ShiftIn;    // input to normalization shift
  logic [`NORMSHIFTSZ-1:0]    Shifted;    // the output of the normalization shifter (before shift correction)
  logic                       Plus1;      // add one to the final result?
  logic                       Overflow;   // overflow flag used to select results
  logic                       Invalid;    // invalid flag used to select results
  logic                       Guard, Round, Sticky; // bits needed to determine rounding
  logic [`FMTBITS-1:0]        OutFmt;     // output format
  // division signals
  logic [`LOGNORMSHIFTSZ-1:0] DivShiftAmt;        // divsqrt shift amount
  logic [`NORMSHIFTSZ-1:0]    DivShiftIn;         // divsqrt shift input
  logic [`NE+1:0]             Qe;                 // divsqrt corrected exponent after correction shift
  logic                       DivByZero;          // divide by zero flag
  logic                       DivResSubnorm;      // is the divsqrt result subnormal
  logic                       DivSubnormShiftPos; // is the divsqrt subnorm shift amount positive (not underflowed)
  // conversion signals (still consumed by the shared flags/specialcase/negateintres modules)
  logic [1:0]                 CvtNegResMsbs;      // most significant bits of possibly negated int result
  logic [`XLEN+1:0]           CvtNegRes;          // possibly negated integer result
  logic                       CvtResUf;           // did the convert result underflow
  logic                       IntInvalid;         // invalid integer flag
  // readability signals
  logic                       Sqrt;       // is the divsqrt operation sqrt
  logic                       Int64;      // is the integer 64 bits?
  logic                       Signed;     // is the operation with a signed integer?
  logic                       IntToFp;    // is the operation an int->fp conversion?
  logic                       CvtOp;      // conversion operation
  logic                       FmaOp;      // fma operation
  logic                       DivOp;      // divider operation
  logic                       InfIn;      // are any of the inputs infinity
  logic                       NaNIn;      // are any of the inputs NaN

  // operation decode
  assign DivOp = (PostProcSel == 2'b01);
  assign Sqrt  = OpCtrl[0];

  // The fma/cvt decode was removed from this module, but the shared downstream modules
  // still take these selects. Tie them off so they are not left undriven (X in simulation).
  // NOTE(review): constant-zero tie-offs assume this unit never handles cvt/fma results - confirm.
  assign CvtOp    = 1'b0;
  assign FmaOp    = 1'b0;
  assign IntToFp  = 1'b0;
  assign Int64    = 1'b0;
  assign Signed   = 1'b0;
  assign CvtResUf = 1'b0;

  // is there an input of infinity or NaN being used
  assign InfIn = XInf|YInf|ZInf;
  assign NaNIn = XNaN|YNaN|ZNaN;

  // The output format is always the instruction's format for divsqrt.
  // Assigned unconditionally so OutFmt is also driven when `FPSIZES == 1.
  assign OutFmt = Fmt;

  ///////////////////////////////////////////////////////////////////////////////
  // Normalization
  ///////////////////////////////////////////////////////////////////////////////

  // final calculations before shifting
  divshiftcalc divshiftcalc(.DivQe, .DivQm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);

  // only the divsqrt unit feeds the shifter
  assign ShiftAmt = DivShiftAmt;
  assign ShiftIn  = DivShiftIn;

  // main normalization shift
  normshift normshift (.ShiftIn, .ShiftAmt, .Shifted);

  // correct for divsqrt error
  divremsqrtshiftcorrection shiftcorrection(.DivResSubnorm, .DivSubnormShiftPos, .DivOp, .DivQe, .Qe, .Shifted, .Mf);

  ///////////////////////////////////////////////////////////////////////////////
  // Rounding
  ///////////////////////////////////////////////////////////////////////////////

  // round to nearest even
  // round to zero
  // round to -infinity
  // round to infinity
  // round to nearest max magnitude

  // calculate result sign used in rounding unit
  // (connections match the roundsign module added alongside this file: Xs, Ys, Sqrt, DivOp -> Ms)
  // NOTE(review): if another module named roundsign/round exists in the project, resolve the name clash.
  roundsign roundsign(.Xs, .Ys, .Sqrt, .DivOp, .Ms);

  // connections match the divsqrt-only round module added alongside this file
  round round(.OutFmt, .Frm, .Ms, .Mf, .DivSticky, .Qe,
      .CvtOp, .ToInt, .CvtResSubnormUf, .CvtResUf, .CvtCe,
      .Me, .UfPlus1, .FullRe, .Re, .Rf, .Sticky, .Plus1, .Round, .Guard);

  ///////////////////////////////////////////////////////////////////////////////
  // Sign calculation
  ///////////////////////////////////////////////////////////////////////////////

  // With the fma path removed there is no zero-sum/infinity sign special-casing left,
  // so the result sign is the rounding sign. Previously Rs was left undriven.
  assign Rs = Ms;

  ///////////////////////////////////////////////////////////////////////////////
  // Flags
  ///////////////////////////////////////////////////////////////////////////////

  flags flags(.XSNaN, .YSNaN, .ZSNaN, .XInf, .YInf, .ZInf, .InfIn, .XZero, .YZero, 
              .Xs, .Sqrt, .ToInt, .IntToFp, .Int64, .Signed, .OutFmt, .CvtCe,
              .NaNIn, .FmaAs, .FmaPs, .Round, .IntInvalid, .DivByZero,
              .Guard, .Sticky, .UfPlus1, .CvtOp, .DivOp, .FmaOp, .FullRe, .Plus1,
              .Me, .CvtNegResMsbs, .Invalid, .Overflow, .PostProcFlg);

  ///////////////////////////////////////////////////////////////////////////////
  // Select the result
  ///////////////////////////////////////////////////////////////////////////////

  negateintres negateintres(.Xs, .Shifted, .Signed, .Int64, .Plus1, .CvtNegResMsbs, .CvtNegRes);

  specialcase specialcase(.Xs, .Xm, .Ym, .Zm, .XZero, .IntInvalid,
      .IntZero, .Frm, .OutFmt, .XNaN, .YNaN, .ZNaN, .CvtResUf, 
      .NaNIn, .IntToFp, .Int64, .Signed, .CvtOp, .FmaOp, .Plus1, .Invalid, .Overflow, .InfIn, .CvtNegRes,
      .XInf, .YInf, .DivOp, .DivByZero, .FullRe, .CvtCe, .Rs, .Re, .Rf, .PostProcRes, .FCvtIntRes);
 endmodule
--- a/src/fpu/divremsqrt/divremsqrtround.sv
+++ b/src/fpu/divremsqrt/divremsqrtround.sv
@ -0,0 +1,339 @@
 ///////////////////////////////////////////
 // divremsqrtround.sv
 //
 // Written: kekim@hmc.edu
 // Modified: 19 May 2023
 //
 // Purpose: Rounder
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
 //
 // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
 //
 // Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
 // https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 // either express or implied. See the License for the specific language governing permissions 
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 `include "wally-config.vh"
 // XLENPOS: where does XLEN fall relative to the fraction widths NF and NF1?
 // The macro expands to 1 when XLEN > NF, otherwise 2 when XLEN > NF1, otherwise 3.
 //  options: 
 //     1: XLEN > NF   > NF1
 //     2: NF   > XLEN > NF1
 //     3: NF   > NF1  > XLEN
 //  NOTE(review): the original note here said "single and double will always be smaller than XLEN";
 //  that presumably refers to a 64-bit XLEN - confirm for configurations with XLEN = 32.
 `define XLENPOS ((`XLEN>`NF) ? 1 : (`XLEN>`NF1) ? 2 : 3)
 module round(
  input  logic [`FMTBITS-1:0]     OutFmt,             // output format
  input  logic [2:0]              Frm,                // rounding mode
  input  logic                    Ms,                 // normalized sign
  input  logic [`CORRSHIFTSZ-1:0] Mf,                 // normalized fraction
  // divsqrt
  input  logic                    DivSticky,          // divsqrt sticky bit
  input  logic [`NE+1:0]          Qe,                 // the divsqrt calculated exponent
  // cvt (ports kept for interface compatibility; only ToInt is read)
  input  logic                    CvtOp,              // is a convert operation being done
  input  logic                    ToInt,              // is the cvt op a cvt to integer
  input  logic                    CvtResSubnormUf,    // is the cvt result subnormal or underflow
  input  logic                    CvtResUf,           // does the cvt result underflow
  input  logic [`NE:0]            CvtCe,              // the cvt calculated exponent
  // outputs
  output logic [`NE+1:0]          Me,                 // normalized exponent (a copy of Qe)
  output logic                    UfPlus1,            // do you add one to the result if given an unbounded exponent
  output logic [`NE+1:0]          FullRe,             // Re with bits to determine sign and overflow
  output logic [`NE-1:0]          Re,                 // Result exponent
  output logic [`NF-1:0]          Rf,                 // Result fraction
  output logic                    Sticky,             // sticky bit
  output logic                    Plus1,              // do you add one to the final result
  output logic                    Round, Guard        // bits needed to calculate rounding
 );

  // Divsqrt-only rounder: picks the LSB/guard/round bits of Mf for the selected output
  // format, forms the sticky bit, decides whether to increment, and adds the increment
  // to {Me, fraction} so that a fraction carry-out bumps the exponent.

  logic           UfCalcPlus1;        // calculated plus one for unbounded exponent
  logic           NormSticky;         // OR of the Mf bits below the round position
  logic [`NF-1:0] RoundFrac;          // fraction before the increment is added
  logic           FpRes;              // is the result a floating point
  logic           IntRes;             // is the result an integer
  logic           FpGuard, FpRound;   // floating point round/guard bits
  logic           FpLsbRes;           // least significant bit of floating point result
  logic           LsbRes;             // lsb of result
  logic           CalcPlus1;          // calculated plus1
  logic           FpPlus1;            // do you add one to the fp result 
  logic [`FLEN:0] RoundAdd;           // how much to add to the result

  ///////////////////////////////////////////////////////////////////////////////
  // Rounding
  ///////////////////////////////////////////////////////////////////////////////

  // round to nearest even
  //      {Guard, Round, Sticky}
  //      0xx - do nothing
  //      100 - tie - Plus1 if result is odd (LsbRes = 1)
  //      1xx otherwise - Plus1
  //  round to zero       - never Plus1
  //  round to -infinity  - Plus1 if negative and inexact
  //  round to infinity   - Plus1 if positive and inexact
  //  round to nearest max magnitude
  //      0xx - do nothing
  //      1xx - Plus1

  // determine what format the final result is in: int or fp
  assign IntRes = ToInt;
  assign FpRes = ~IntRes;

  // sticky bit calculation: OR together every Mf bit below the round position
  // of the selected output format
  if (`FPSIZES == 1) begin

      //     1: XLEN > NF
      //      |         XLEN          |
      //      |    NF     |1|1|
      //                     ^    ^ if floating point result
      //                     ^ if not an FMA result
      if (`XLENPOS == 1)assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
      //     2: NF > XLEN
      if (`XLENPOS == 2)assign NormSticky = (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&IntRes) |
                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);

  end else if (`FPSIZES == 2) begin
      // XLEN is either 64 or 32
      // so half and single are always smaller than XLEN

      // 1: XLEN > NF   > NF1
      if (`XLENPOS == 1) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&FpRes&~OutFmt) |
                                                (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
      // 2: NF   > XLEN > NF1
      if (`XLENPOS == 2) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~OutFmt) | 
                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&(IntRes|~OutFmt)) |
                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);
      // 3: NF   > NF1  > XLEN
      if (`XLENPOS == 3) assign NormSticky = (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF1-1]&IntRes) |
                                                (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&(~OutFmt|IntRes)) |
                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);

  end else if (`FPSIZES == 3) begin
      // 1: XLEN > NF   > NF1
      if (`XLENPOS == 1) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`NF1-1]&FpRes&(OutFmt==`FMT1)) |
                                                (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&FpRes&~(OutFmt==`FMT)) |
                                                (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
      // 2: NF   > XLEN > NF1
      if (`XLENPOS == 2) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`NF1-1]&FpRes&(OutFmt==`FMT1)) |
                                                (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~(OutFmt==`FMT)) | 
                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&(IntRes|~(OutFmt==`FMT))) |
                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);
      // 3: NF   > NF1  > XLEN
      if (`XLENPOS == 3) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&(OutFmt==`FMT1)) |
                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF1-1]&((OutFmt==`FMT1)|IntRes)) |
                                                (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&(~(OutFmt==`FMT)|IntRes)) |
                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);

  end else if (`FPSIZES == 4) begin
      // Quad precision will always be greater than XLEN
      // 2: NF   > XLEN > NF1
      if (`XLENPOS == 2) assign NormSticky = (|Mf[`CORRSHIFTSZ-`H_NF-2:`CORRSHIFTSZ-`S_NF-1]&FpRes&(OutFmt==`H_FMT)) |
                                                (|Mf[`CORRSHIFTSZ-`S_NF-2:`CORRSHIFTSZ-`D_NF-1]&FpRes&((OutFmt==`S_FMT)|(OutFmt==`H_FMT))) | 
                                                (|Mf[`CORRSHIFTSZ-`D_NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~(OutFmt==`Q_FMT)) | 
                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`Q_NF-1]&(~(OutFmt==`Q_FMT)|IntRes)) |
                                                (|Mf[`CORRSHIFTSZ-`Q_NF-2:0]);
      // 3: NF   > NF1  > XLEN
      // The extra XLEN bit will be ORed later when calculating the final sticky bit - the ufplus1 not needed for integer
      if (`XLENPOS == 3) assign NormSticky = (|Mf[`CORRSHIFTSZ-`H_NF-2:`CORRSHIFTSZ-`S_NF-1]&FpRes&(OutFmt==`H_FMT)) |
                                                (|Mf[`CORRSHIFTSZ-`S_NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&((OutFmt==`S_FMT)|(OutFmt==`H_FMT))) |
                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`D_NF-1]&((OutFmt==`S_FMT)|(OutFmt==`H_FMT)|IntRes)) |
                                                (|Mf[`CORRSHIFTSZ-`D_NF-2:`CORRSHIFTSZ-`Q_NF-1]&(~(OutFmt==`Q_FMT)|IntRes)) |
                                                (|Mf[`CORRSHIFTSZ-`Q_NF-2:0]);

  end

  // The final sticky bit combines the iterator's remainder sticky (DivSticky) with the
  // bits of the normalized significand that lie below the round position (NormSticky).
  // Previously only DivSticky was used, which dropped NormSticky entirely and lost
  // inexactness carried in the low bits of Mf.
  assign Sticky = NormSticky | DivSticky;

  // determine round and LSB of the rounded value
  //      - underflow round bit is used to determine the underflow flag
  if (`FPSIZES == 1) begin
      assign FpGuard = Mf[`CORRSHIFTSZ-`NF-1];
      assign FpLsbRes = Mf[`CORRSHIFTSZ-`NF];
      assign FpRound = Mf[`CORRSHIFTSZ-`NF-2];

  end else if (`FPSIZES == 2) begin
      assign FpGuard = OutFmt ? Mf[`CORRSHIFTSZ-`NF-1] : Mf[`CORRSHIFTSZ-`NF1-1];
      assign FpLsbRes = OutFmt ? Mf[`CORRSHIFTSZ-`NF] : Mf[`CORRSHIFTSZ-`NF1];
      assign FpRound = OutFmt ? Mf[`CORRSHIFTSZ-`NF-2] : Mf[`CORRSHIFTSZ-`NF1-2];

  end else if (`FPSIZES == 3) begin
      always_comb
          case (OutFmt)
              `FMT: begin
                  FpGuard = Mf[`CORRSHIFTSZ-`NF-1];
                  FpLsbRes = Mf[`CORRSHIFTSZ-`NF];
                  FpRound = Mf[`CORRSHIFTSZ-`NF-2];
              end
              `FMT1: begin
                  FpGuard = Mf[`CORRSHIFTSZ-`NF1-1];
                  FpLsbRes = Mf[`CORRSHIFTSZ-`NF1];
                  FpRound = Mf[`CORRSHIFTSZ-`NF1-2];
              end
              `FMT2: begin
                  FpGuard = Mf[`CORRSHIFTSZ-`NF2-1];
                  FpLsbRes = Mf[`CORRSHIFTSZ-`NF2];
                  FpRound = Mf[`CORRSHIFTSZ-`NF2-2];
              end
              default: begin
                  FpGuard = 1'bx;
                  FpLsbRes = 1'bx;
                  FpRound = 1'bx;
              end
          endcase
  end else if (`FPSIZES == 4) begin
      always_comb
          case (OutFmt)
              2'h3: begin
                  FpGuard = Mf[`CORRSHIFTSZ-`Q_NF-1];
                  FpLsbRes = Mf[`CORRSHIFTSZ-`Q_NF];
                  FpRound = Mf[`CORRSHIFTSZ-`Q_NF-2];
              end
              2'h1: begin
                  FpGuard = Mf[`CORRSHIFTSZ-`D_NF-1];
                  FpLsbRes = Mf[`CORRSHIFTSZ-`D_NF];
                  FpRound = Mf[`CORRSHIFTSZ-`D_NF-2];
              end
              2'h0: begin
                  FpGuard = Mf[`CORRSHIFTSZ-`S_NF-1];
                  FpLsbRes = Mf[`CORRSHIFTSZ-`S_NF];
                  FpRound = Mf[`CORRSHIFTSZ-`S_NF-2];
              end
              2'h2: begin
                  FpGuard = Mf[`CORRSHIFTSZ-`H_NF-1];
                  FpLsbRes = Mf[`CORRSHIFTSZ-`H_NF];
                  FpRound = Mf[`CORRSHIFTSZ-`H_NF-2];
              end
          endcase
  end

  // no integer-result path in this rounder: always use the floating-point bit positions
  assign Guard =  FpGuard;
  assign LsbRes = FpLsbRes;
  assign Round =  FpRound;

  always_comb begin
      // Determine if you add 1
      case (Frm)
          3'b000: CalcPlus1 = Guard & (Round|Sticky|LsbRes);//round to nearest even
          3'b001: CalcPlus1 = 0;//round to zero
          3'b010: CalcPlus1 = Ms;//round down
          3'b011: CalcPlus1 = ~Ms;//round up
          3'b100: CalcPlus1 = Guard;//round to nearest max magnitude
          default: CalcPlus1 = 1'bx;
      endcase
      // Determine if you add 1 (for underflow flag)
      case (Frm)
          3'b000: UfCalcPlus1 = Round & (Sticky|Guard);//round to nearest even
          3'b001: UfCalcPlus1 = 0;//round to zero
          3'b010: UfCalcPlus1 = Ms;//round down
          3'b011: UfCalcPlus1 = ~Ms;//round up
          3'b100: UfCalcPlus1 = Round;//round to nearest max magnitude
          default: UfCalcPlus1 = 1'bx;
      endcase
  
  end

  // If an answer is exact don't round
  assign Plus1 = CalcPlus1 & (Sticky|Round|Guard);
  assign FpPlus1 = Plus1;
  assign UfPlus1 = UfCalcPlus1 & (Sticky|Round);

  // place Plus1 into the proper position for the format
  if (`FPSIZES == 1) begin
      assign RoundAdd = {{`FLEN{1'b0}}, FpPlus1};

  end else if (`FPSIZES == 2) begin
      // \/FLEN+1
      //  | NE+2 |        NF      |
      //  '-NE+2-^----NF1----^
      // `FLEN+1-`NE-2-`NF1 = FLEN-1-NE-NF1
      assign RoundAdd = {(`NE+1+`NF1)'(0), FpPlus1&~OutFmt, (`NF-`NF1-1)'(0), FpPlus1&OutFmt};

  end else if (`FPSIZES == 3) begin
      assign RoundAdd = {(`NE+1+`NF2)'(0), FpPlus1&(OutFmt==`FMT2), (`NF1-`NF2-1)'(0), FpPlus1&(OutFmt==`FMT1), (`NF-`NF1-1)'(0), FpPlus1&(OutFmt==`FMT)};

  end else if (`FPSIZES == 4)      
      assign RoundAdd = {(`Q_NE+1+`H_NF)'(0), FpPlus1&(OutFmt==`H_FMT), (`S_NF-`H_NF-1)'(0), FpPlus1&(OutFmt==`S_FMT), (`D_NF-`S_NF-1)'(0), FpPlus1&(OutFmt==`D_FMT), (`Q_NF-`D_NF-1)'(0), FpPlus1&(OutFmt==`Q_FMT)};

  // trim unneeded bits from fraction
  assign RoundFrac = Mf[`CORRSHIFTSZ-1:`CORRSHIFTSZ-`NF];
  
  // the exponent always comes from the divsqrt unit
  assign Me = Qe;

  // round the result
  //      - if the fraction overflows one should be added to the exponent
  assign {FullRe, Rf} = {Me, RoundFrac} + RoundAdd;
  assign Re = FullRe[`NE-1:0];
 endmodule
--- a/src/fpu/divremsqrt/divremsqrtroundsign.sv
+++ b/src/fpu/divremsqrt/divremsqrtroundsign.sv
@ -0,0 +1,46 @@
 ///////////////////////////////////////////
 // divremsqrtroundsign.sv
 //
 // Written: kekim@hmc.edu,me@KatherineParry.com
 // Modified: 19 May 2023
 //
 // Purpose: Sign calculation for rounding
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
 //
 // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
 //
 // Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
 // https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 // either express or implied. See the License for the specific language governing permissions 
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 `include "wally-config.vh"
 module roundsign(
  input logic         Xs,     // sign of operand X
  input logic         Ys,     // sign of operand Y
  input logic         Sqrt,   // square-root operation? (when using the divsqrt unit)
  input logic         DivOp,  // is this a divsqrt operation
  output logic        Ms      // sign handed to the rounding logic
 );
  logic               YsEff;  // Y's sign as it contributes to the result
  // A square root has a single operand, so Y's sign is masked out when Sqrt is set
  assign YsEff = Sqrt ? 1'b0 : Ys;
  // The divsqrt result sign is the XOR of X's sign with the (masked) Y sign.
  // It is only passed on when a divsqrt operation is selected; otherwise Ms is 0.
  assign Ms = DivOp ? (Xs ^ YsEff) : 1'b0;
 endmodule
--- a/src/fpu/divremsqrt/divremsqrtshiftcorrection.sv
+++ b/src/fpu/divremsqrt/divremsqrtshiftcorrection.sv
@ -0,0 +1,93 @@
 ///////////////////////////////////////////
 // divremsqrtshiftcorrection.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
 //
 // Purpose: shift correction
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
 //
 // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
 //
 // Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
 // https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 // either express or implied. See the License for the specific language governing permissions 
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 `include "wally-config.vh"
 module divremsqrtshiftcorrection(
  input logic  [`NORMSHIFTSZ-1:0] Shifted,                // normalization shifter output, before the one-bit correction
  // divsqrt
  input logic                     DivOp,                  // is it a divsqrt operation
  input logic                     DivResSubnorm,          // is the divsqrt result subnormal
  input logic  [`NE+1:0]          DivQe,                  // the divsqrt result's exponent
  input logic                     DivSubnormShiftPos,     // is the subnorm divider shift amount positive (ie not underflowed)
  // output
  output logic [`CORRSHIFTSZ-1:0] Mf,                     // corrected window of Shifted passed on to rounding
  output logic [`NE+1:0]          Qe                      // corrected exponent for divider
 );
  // NOTE: this is the divsqrt-only variant. The FMA ports, the FMA exponent calculation and the
  // LZA sum-correction mux were commented out or drove only unused signals, so they are omitted here.
  logic [`CORRSHIFTSZ-1:0]    CorrQm0, CorrQm1;           // portions of Shifted to select for CorrQmShifted
  logic [`CORRSHIFTSZ-1:0]    CorrQmShifted;              // the shifted divsqrt result after the one-bit correction
  logic                       LZAPlus1;                   // msb of the shifter output
  logic                       LeftShiftQm;                // select CorrQm1 instead of CorrQm0
  // the msb of the shifted value decides both the mantissa window and the exponent adjustment
  assign LZAPlus1 = Shifted[`NORMSHIFTSZ-1];
  // correct the shifting of the divsqrt caused by producing a result in the [.5, 2) range
  //    condition: if the msb is 1 or the exponent was one, but the shifted quotient was < 1 (Subnorm)
  assign LeftShiftQm = LZAPlus1 | ((DivQe == 1) & ~LZAPlus1);
  // two candidate windows of Shifted, offset from each other by one bit position
  assign CorrQm0 = Shifted[`NORMSHIFTSZ-3:`NORMSHIFTSZ-`CORRSHIFTSZ-2];
  assign CorrQm1 = Shifted[`NORMSHIFTSZ-2:`NORMSHIFTSZ-`CORRSHIFTSZ-1];
  // NOTE(review): assumes mux2 selects its second data input (CorrQm1) when the select is 1 - confirm against mux2
  mux2 #(`CORRSHIFTSZ) divcorrmux(CorrQm0, CorrQm1, LeftShiftQm, CorrQmShifted);
  // if the result of the divider was calculated to be subnormal, then the result was correctly normalized,
  // so select the top shifted bits; otherwise use the one-bit corrected divsqrt window
  always_comb
    if (DivOp&~DivResSubnorm)  Mf = CorrQmShifted;
    else                       Mf = Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ];
  // the quotient is in the range [.5,2) if there is no early termination
  // if the quotient < 1 and not Subnormal then subtract 1 to account for the normalization shift
  // a subnormal result with a positive subnormal shift amount gets a zero exponent
  assign Qe = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivQe - {(`NE+1)'(0), ~LZAPlus1};
 endmodule