began divremsqrt specific postprocessing

2025-02-11 06:05:49 +00:00 · 2023-05-19 14:20:22 -07:00 · 2023-05-19 14:20:22 -07:00 · e17cfe9622
commit e17cfe9622
parent c9ceda5794
5 changed files with 813 additions and 0 deletions
--- a/src/fpu/divremsqrt/divremsqrt.sv
+++ b/src/fpu/divremsqrt/divremsqrt.sv
@ -0,0 +1,104 @@
+///////////////////////////////////////////
+// divremsqrt.sv
+//
+// Written: kekim@hmc.edu
+// Modified:19 May 2023
+//
+// Purpose: Combined Divide and Square Root Floating Point and Integer Unit with postprocessing
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// fdivsqrt: structural top level of the combined divide / square-root unit.
+// Computes X/Y, sqrt(X), A/B, or A%B by wiring together a preprocessor, an FSM,
+// an iterator, and a postprocessor; this module contains no logic of its own.
+// NOTE(review): this file is divremsqrt.sv but the module is still called fdivsqrt;
+// confirm it cannot clash with another fdivsqrt definition in the same build.
+module fdivsqrt(
+  input  logic                clk,
+  input  logic                reset,
+  input  logic [`FMTBITS-1:0] FmtE,
+  input  logic                XsE,
+  input  logic [`NF:0]        XmE, YmE,
+  input  logic [`NE-1:0]      XeE, YeE,
+  input  logic                XInfE, YInfE,
+  input  logic                XZeroE, YZeroE,
+  input  logic                XNaNE, YNaNE,
+  input  logic                FDivStartE, IDivStartE,
+  input  logic                StallM,
+  input  logic                FlushE,
+  input  logic                SqrtE, SqrtM,
+  input  logic [`XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+  input  logic [2:0]          Funct3E, Funct3M,
+  input  logic                IntDivE, W64E,
+  output logic                DivStickyM,
+  output logic                FDivBusyE, IFDivStartE, FDivDoneE,
+  output logic [`NE+1:0]      QeM,
+  output logic [`DIVb:0]      QmM,
+  output logic [`XLEN-1:0]    FIntDivResultM
+);
+
+  // Floating-point division and square root module, with optional integer division and remainder
+  // Computes X/Y, sqrt(X), A/B, or A%B
+
+  logic [`DIVb+3:0]           WS, WC;                       // Partial remainder components
+  logic [`DIVb+3:0]           X;                            // Iterator Initial Value (from dividend)
+  logic [`DIVb+3:0]           D;                            // Iterator Divisor
+  logic [`DIVb:0]             FirstU, FirstUM;              // Intermediate result values
+  logic [`DIVb+1:0]           FirstC;                       // Step tracker
+  logic                       Firstun;                      // Quotient selection
+  logic                       WZeroE;                       // Early termination flag
+  logic [`DURLEN-1:0]         CyclesE;                      // FSM cycles
+  logic                       SpecialCaseM;                 // Divide by zero, square root of negative, etc.
+  // (the previously declared DivStartE was never connected to anything and has been dropped)
+
+  // Integer div/rem signals
+  logic                       BZeroM;                       // Denominator is zero
+  logic                       IntDivM;                      // Integer operation
+  logic [`DIVBLEN:0]          nM, mM;                       // Shift amounts
+  logic                       NegQuotM, ALTBM, AsM, W64M;   // Special handling for postprocessor
+  logic [`XLEN-1:0]           AM;                           // Original Numerator for postprocessor
+  logic                       ISpecialCaseE;                // Integer div/remainder special cases
+
+  // Preprocessor: takes the raw operands and produces the iterator inputs X and D,
+  // the cycle count for the FSM, the exponent QeM, and the integer side-band signals.
+  fdivsqrtpreproc fdivsqrtpreproc(
+    .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE),
+    .FmtE, .SqrtE, .XZeroE, .Funct3E, .QeM, .X, .D, .CyclesE,
+    // Int-specific
+    .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
+    .BZeroM, .nM, .mM, .AM,
+    .IntDivM, .W64M, .NegQuotM, .ALTBM, .AsM);
+
+  // FSM: sequences the operation and produces the busy / start / done handshakes.
+  fdivsqrtfsm fdivsqrtfsm(
+    .clk, .reset, .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE,
+    .FDivStartE, .XsE, .SqrtE, .WZeroE, .FlushE, .StallM,
+    .FDivBusyE, .IFDivStartE, .FDivDoneE, .SpecialCaseM, .CyclesE,
+    // Int-specific
+    .IDivStartE, .ISpecialCaseE, .IntDivE);
+
+  // CSA iterator: its FirstWS/FirstWC outputs are the partial remainder WS/WC.
+  fdivsqrtiter fdivsqrtiter(
+    .clk, .IFDivStartE, .FDivBusyE, .SqrtE, .X, .D,
+    .FirstU, .FirstUM, .FirstC, .Firstun, .FirstWS(WS), .FirstWC(WC));
+
+  // Postprocessor: turns the iterator state into QmM, DivStickyM and the integer result.
+  // RemOpM is taken from Funct3M[1].
+  fdivsqrtpostproc fdivsqrtpostproc(
+    .clk, .reset, .StallM, .WS, .WC, .D, .FirstU, .FirstUM, .FirstC,
+    .SqrtE, .Firstun, .SqrtM, .SpecialCaseM,
+    .QmM, .WZeroE, .DivStickyM,
+    // Int-specific
+    .nM, .mM, .ALTBM, .AsM, .BZeroM, .NegQuotM, .W64M, .RemOpM(Funct3M[1]), .AM,
+    .FIntDivResultM);
+endmodule
+
+
--- a/src/fpu/divremsqrt/divremsqrtpostprocess.sv
+++ b/src/fpu/divremsqrt/divremsqrtpostprocess.sv
@ -0,0 +1,231 @@
+///////////////////////////////////////////
+// divremsqrtpostprocess.sv
+//
+// Written: kekim@hmc.edu
+// Modified: 19 May 2023
+//
+// Purpose: Post-Processing: normalization, rounding, sign, flags, special cases
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// divremsqrtpostprocess: post-processing (normalization shift, rounding, sign, flags,
+// special-case selection) for the divide/sqrt path only.
+// The port list is kept identical to the general FPU postprocessor so it can be dropped
+// into the same place, but the fma and conversion datapaths are not instantiated here:
+// their control signals are tied off to 0 below so that no consumer sees an undriven net.
+module divremsqrtpostprocess (
+  // general signals
+  input logic                             Xs, Ys,     // input signs
+  input logic  [`NF:0]                    Xm, Ym, Zm, // input mantissas
+  input logic  [2:0]                      Frm,        // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+  input logic  [`FMTBITS-1:0]             Fmt,        // precision 1 = double 0 = single
+  input logic  [2:0]                      OpCtrl,     // operation select; only OpCtrl[0] (sqrt) is decoded in this module
+  input logic                             XZero, YZero,        // inputs are zero
+  input logic                             XInf, YInf, ZInf,    // inputs are infinity
+  input logic                             XNaN, YNaN, ZNaN,    // inputs are NaN
+  input logic                             XSNaN, YSNaN, ZSNaN, // inputs are signaling NaNs
+  input logic  [1:0]                      PostProcSel,         // select result to be written to fp register
+  // fma signals (not used by this module's logic; kept for interface compatibility)
+  input logic                             FmaAs,      // the modified Z sign - depends on instruction
+  input logic                             FmaPs,      // the product's sign
+  input logic                             FmaSs,      // Sum sign
+  input logic  [`NE+1:0]                  FmaSe,      // the sum's exponent
+  input logic  [3*`NF+3:0]                FmaSm,      // the positive sum
+  input logic                             FmaASticky, // sticky bit that is calculated during alignment
+  input logic  [$clog2(3*`NF+5)-1:0]      FmaSCnt,    // the normalization shift count
+  // divide signals
+  input logic                             DivSticky,  // divider sticky bit
+  input logic  [`NE+1:0]                  DivQe,      // divsqrt exponent
+  input logic  [`DIVb:0]                  DivQm,      // divsqrt significand
+  // conversion signals
+  input logic                             CvtCs,      // the result's sign
+  input logic  [`NE:0]                    CvtCe,      // the calculated exponent
+  input logic                             CvtResSubnormUf, // the convert result is subnormal or underflows
+  input logic  [`LOGCVTLEN-1:0]           CvtShiftAmt,// how much to shift by
+  input logic                             ToInt,      // is fp->int (since it's writing to the integer register)
+  input logic  [`CVTLEN-1:0]              CvtLzcIn,   // input to the Leading Zero Counter (without msb)
+  input logic                             IntZero,    // is the integer input zero
+  // final results
+  output logic [`FLEN-1:0]                PostProcRes,// postprocessor final result
+  output logic [4:0]                      PostProcFlg,// postprocessor flags
+  output logic [`XLEN-1:0]                FCvtIntRes  // the integer conversion result
+  );
+  
+  // general signals
+  logic                       Rs;         // result sign
+  logic [`NF-1:0]             Rf;         // Result fraction
+  logic [`NE-1:0]             Re;         // Result exponent
+  logic                       Ms;         // norMalized sign
+  logic [`CORRSHIFTSZ-1:0]    Mf;         // norMalized fraction
+  logic [`NE+1:0]             Me;         // normalized exponent
+  logic [`NE+1:0]             FullRe;     // Re with bits to determine sign and overflow
+  logic                       UfPlus1;    // do you add one (for determining underflow flag)
+  logic [`LOGNORMSHIFTSZ-1:0] ShiftAmt;   // normalization shift amount
+  logic [`NORMSHIFTSZ-1:0]    ShiftIn;    // input to normalization shift
+  logic [`NORMSHIFTSZ-1:0]    Shifted;    // the output of the normalized shifter (before shift correction)
+  logic                       Plus1;      // add one to the final result?
+  logic                       Overflow;   // overflow flag used to select results
+  logic                       Invalid;    // invalid flag used to select results
+  logic                       Guard, Round, Sticky; // bits needed to determine rounding
+  logic [`FMTBITS-1:0]        OutFmt;     // output format
+  // fma signals (only referenced by the disabled fma code below)
+  logic [`NE+1:0]             FmaMe;      // exponent of the normalized sum
+  logic                       FmaSZero;   // is the sum zero
+  logic [3*`NF+5:0]           FmaShiftIn; // fma shift input
+  logic [`NE+1:0]             NormSumExp; // exponent of the normalized sum not taking into account Subnormal or zero results
+  logic                       FmaPreResultSubnorm; // is the result subnormal - calculated before LZA correction
+  logic [$clog2(3*`NF+5)-1:0] FmaShiftAmt;// normalization shift amount for fma
+  // division signals
+  logic [`LOGNORMSHIFTSZ-1:0] DivShiftAmt;        // divsqrt shift amount
+  logic [`NORMSHIFTSZ-1:0]    DivShiftIn;         // divsqrt shift input
+  logic [`NE+1:0]             Qe;                 // divsqrt corrected exponent after correction shift
+  logic                       DivByZero;          // divide by zero flag
+  logic                       DivResSubnorm;      // is the divsqrt result subnormal
+  logic                       DivSubnormShiftPos; // is the divsqrt subnorm shift amount positive (not underflowed)
+  // conversion signals
+  logic [`CVTLEN+`NF:0]       CvtShiftIn;         // number to be shifted for converter (only used by disabled code)
+  logic [1:0]                 CvtNegResMsbs;      // most significant bits of possibly negated int result
+  logic [`XLEN+1:0]           CvtNegRes;          // possibly negated integer result
+  logic                       CvtResUf;           // did the convert result underflow
+  logic                       IntInvalid;         // invalid integer flag
+  // readability signals
+  logic                       Mult;       // multiply operation (only used by disabled code)
+  logic                       Sqrt;       // is the divsqrt operation sqrt
+  logic                       Int64;      // is the integer 64 bits?
+  logic                       Signed;     // is the operation with a signed integer?
+  logic                       IntToFp;    // is the operation an int->fp conversion?
+  logic                       CvtOp;      // conversion operation
+  logic                       FmaOp;      // fma operation
+  logic                       DivOp;      // divider operation
+  logic                       InfIn;      // are any of the inputs infinity
+  logic                       NaNIn;      // are any of the inputs NaN
+
+  // signals to help readability
+  assign DivOp = (PostProcSel == 2'b01);
+  assign Sqrt =  OpCtrl[0];
+
+  // The fma and conversion paths do not exist in this postprocessor. Their select and
+  // decode signals are still consumed by flags / negateintres / specialcase below, so
+  // drive them to a defined "not this operation" value instead of leaving them floating.
+  // Legacy decodes, kept for reference:
+  //assign Signed =  OpCtrl[0];
+  //assign Int64 =   OpCtrl[1];
+  //assign IntToFp = OpCtrl[2];
+  //assign Mult = OpCtrl[2]&~OpCtrl[1]&~OpCtrl[0];
+  //assign CvtOp = (PostProcSel == 2'b00);
+  //assign FmaOp = (PostProcSel == 2'b10);
+  assign FmaOp    = 1'b0;
+  assign CvtOp    = 1'b0;
+  assign IntToFp  = 1'b0;
+  assign Int64    = 1'b0;
+  assign Signed   = 1'b0;
+  assign CvtResUf = 1'b0; // was produced by cvtshiftcalc, which is disabled below
+
+  // is there an input of infinity or NaN being used
+  assign InfIn = XInf|YInf|ZInf;
+  assign NaNIn = XNaN|YNaN|ZNaN;
+
+  // choose the output format
+  //      - with no fp -> fp conversion handled here, Fmt always holds the precision of the output
+  //      - assigned unconditionally so OutFmt is driven for every FPSIZES configuration
+  assign OutFmt = Fmt;
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Normalization
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // final calculations before shifting
+  /*cvtshiftcalc cvtshiftcalc(.ToInt, .CvtCe, .CvtResSubnormUf, .Xm, .CvtLzcIn,  
+      .XZero, .IntToFp, .OutFmt, .CvtResUf, .CvtShiftIn);*/
+
+  /*fmashiftcalc fmashiftcalc(.FmaSm, .FmaSCnt, .Fmt, .NormSumExp, .FmaSe,
+      .FmaSZero, .FmaPreResultSubnorm, .FmaShiftAmt, .FmaShiftIn);*/
+
+  divshiftcalc divshiftcalc(.DivQe, .DivQm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
+
+  // only the divsqrt shifter feeds the normalization shift in this module
+  assign ShiftAmt = DivShiftAmt;
+  assign ShiftIn = DivShiftIn;
+  /*
+  // select which unit's output to shift
+  always_comb
+    case(PostProcSel)
+      2'b10: begin // fma
+        ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(3*`NF+5){1'b0}}, FmaShiftAmt};
+        ShiftIn =  {FmaShiftIn, {`NORMSHIFTSZ-(3*`NF+6){1'b0}}};
+      end
+      2'b00: begin // cvt
+        ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(`CVTLEN+1){1'b0}}, CvtShiftAmt};
+        ShiftIn =  {CvtShiftIn, {`NORMSHIFTSZ-`CVTLEN-`NF-1{1'b0}}};
+      end
+      2'b01: begin //divsqrt
+        ShiftAmt = DivShiftAmt;
+        ShiftIn =  DivShiftIn;
+      end
+      default: begin 
+        ShiftAmt = {`LOGNORMSHIFTSZ{1'bx}}; 
+        ShiftIn = {`NORMSHIFTSZ{1'bx}}; 
+      end
+    endcase
+  */
+  
+  // main normalization shift
+  normshift normshift (.ShiftIn, .ShiftAmt, .Shifted);
+
+  // correct for LZA/divsqrt error
+  divremsqrtshiftcorrection shiftcorrection(.DivResSubnorm, .DivSubnormShiftPos, .DivOp, .DivQe, .Qe, .Shifted, .Mf);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Rounding
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // round to nearest even
+  // round to zero
+  // round to -infinity
+  // round to infinity
+  // round to nearest max magnitude
+
+  // calculate result sign used in rounding unit
+  // The divsqrt-specific sign module in divremsqrtroundsign.sv is declared as `roundsign`
+  // with ports Xs, Ys, Sqrt, DivOp, Ms only, so instantiate it under that name and connect
+  // just those ports.
+  // NOTE(review): the file name suggests the module may be meant to be renamed
+  // divremsqrtroundsign; if another roundsign is compiled alongside, resolve the clash.
+  roundsign roundsign(.Xs, .Ys, .Sqrt, .DivOp, .Ms);
+
+  // Connect only the ports the divsqrt rounder in divremsqrtround.sv actually declares
+  // (its fma ports, PostProcSel and DivOp are commented out in that module).
+  // NOTE(review): that module is still named `round`; confirm it does not clash with
+  // another round definition in the build.
+  round round(.OutFmt, .Frm, .Ms, .Mf, .DivSticky, .Qe,
+      .CvtOp, .ToInt, .CvtResSubnormUf, .CvtResUf, .CvtCe,
+      .Me, .UfPlus1, .FullRe, .Re, .Rf, .Sticky, .Plus1, .Round, .Guard);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Sign calculation
+  ///////////////////////////////////////////////////////////////////////////////
+
+  /*resultsign resultsign(.Frm, .FmaPs, .FmaAs, .Round, .Sticky, .Guard,
+      .FmaOp, .ZInf, .InfIn, .FmaSZero, .Mult, .Ms, .Rs);*/
+
+  // resultsign is disabled, which previously left Rs undriven even though specialcase
+  // consumes it. With no fma path the final sign is the sign used for rounding.
+  // NOTE(review): assumes the legacy resultsign passes Ms through for non-fma operations - confirm.
+  assign Rs = Ms;
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Flags
+  ///////////////////////////////////////////////////////////////////////////////
+
+  flags flags(.XSNaN, .YSNaN, .ZSNaN, .XInf, .YInf, .ZInf, .InfIn, .XZero, .YZero, 
+              .Xs, .Sqrt, .ToInt, .IntToFp, .Int64, .Signed, .OutFmt, .CvtCe,
+              .NaNIn, .FmaAs, .FmaPs, .Round, .IntInvalid, .DivByZero,
+              .Guard, .Sticky, .UfPlus1, .CvtOp, .DivOp, .FmaOp, .FullRe, .Plus1,
+              .Me, .CvtNegResMsbs, .Invalid, .Overflow, .PostProcFlg);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Select the result
+  ///////////////////////////////////////////////////////////////////////////////
+
+  negateintres negateintres(.Xs, .Shifted, .Signed, .Int64, .Plus1, .CvtNegResMsbs, .CvtNegRes);
+
+  specialcase specialcase(.Xs, .Xm, .Ym, .Zm, .XZero, .IntInvalid,
+      .IntZero, .Frm, .OutFmt, .XNaN, .YNaN, .ZNaN, .CvtResUf, 
+      .NaNIn, .IntToFp, .Int64, .Signed, .CvtOp, .FmaOp, .Plus1, .Invalid, .Overflow, .InfIn, .CvtNegRes,
+      .XInf, .YInf, .DivOp, .DivByZero, .FullRe, .CvtCe, .Rs, .Re, .Rf, .PostProcRes, .FCvtIntRes);
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtround.sv
+++ b/src/fpu/divremsqrt/divremsqrtround.sv
@ -0,0 +1,339 @@
+///////////////////////////////////////////
+// divremsqrtround.sv
+//
+// Written: kekim@hmc.edu
+// Modified: 19 May 2023
+//
+// Purpose: Rounder
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// what position is XLEN in?
+//  options: 
+//     1: XLEN > NF   > NF1
+//     2: NF   > XLEN > NF1
+//     3: NF   > NF1  > XLEN
+//  single and double will always be smaller than XLEN
+`define XLENPOS ((`XLEN>`NF) ? 1 : (`XLEN>`NF1) ? 2 : 3)
+
+// round: rounder for the divide/sqrt postprocessor.
+// Takes the normalized fraction Mf and exponent Qe, extracts the guard/round/lsb bits
+// for the selected output format, decides from the rounding mode Frm whether to add one
+// ulp, and produces the rounded exponent/fraction {FullRe, Rf} plus the bits the flag
+// logic needs (Sticky, Round, Guard, Plus1, UfPlus1).
+// The fma ports, PostProcSel and DivOp of the general rounder are intentionally
+// commented out; the cvt ports are still declared for interface compatibility.
+module round(
+  input  logic [`FMTBITS-1:0]     OutFmt,             // output format
+  input  logic [2:0]              Frm,                // rounding mode
+  //input  logic [1:0]              PostProcSel,        // select the postprocessor output
+  input  logic                    Ms,                 // normalized sign
+  input  logic [`CORRSHIFTSZ-1:0] Mf,                 // normalized fraction
+  // fma
+  //input  logic                    FmaOp,              // is an fma operation being done?
+  //input  logic [`NE+1:0]          FmaMe,              // exponent of the normalized sum for fma
+  //input  logic                    FmaASticky,         // addend's sticky bit
+
+  // divsqrt
+  //input  logic                    DivOp,              // is a division operation being done
+  input  logic                    DivSticky,          // divsqrt sticky bit
+  input  logic [`NE+1:0]          Qe,                 // the divsqrt calculated exponent
+  // cvt
+  input  logic                    CvtOp,              // is a convert operation being done (unused here)
+  input  logic                    ToInt,              // is the cvt op a cvt to integer
+  input  logic                    CvtResSubnormUf,    // is the cvt result subnormal or underflow (unused here)
+  input  logic                    CvtResUf,           // does the cvt result underflow (unused here)
+  input  logic [`NE:0]            CvtCe,              // the cvt calculated exponent (unused here)
+  // outputs
+  output logic [`NE+1:0]          Me,                 // normalized exponent (copy of Qe)
+  output logic                    UfPlus1,            // do you add one to the result if given an unbounded exponent
+  output logic [`NE+1:0]          FullRe,             // Re with bits to determine sign and overflow
+  output logic [`NE-1:0]          Re,                 // Result exponent
+  output logic [`NF-1:0]          Rf,                 // Result fraction
+  output logic                    Sticky,             // sticky bit
+  output logic                    Plus1,              // do you add one to the final result
+  output logic                    Round, Guard        // bits needed to calculate rounding
+);
+
+  logic           UfCalcPlus1;        // calculated plus one for unbounded exponent
+  logic           NormSticky;         // normalized fraction's sticky bit
+  logic [`NF-1:0] RoundFrac;          // fraction before rounding, trimmed to NF bits
+  logic           FpRes;              // is the result a floating point
+  logic           IntRes;             // is the result an integer
+  logic           FpGuard, FpRound;   // floating point round/guard bits
+  logic           FpLsbRes;           // least significant bit of floating point result
+  logic           LsbRes;             // lsb of result
+  logic           CalcPlus1;          // calculated plus1
+  logic           FpPlus1;            // do you add one to the fp result 
+  logic [`FLEN:0] RoundAdd;           // how much to add to the result
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Rounding
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // round to nearest even
+  //      {Round, Sticky}
+  //      0x - do nothing
+  //      10 - tie - Plus1 if result is odd  (LSBNormSum = 1)
+  //          - don't add 1 if a small number was supposed to be subtracted
+  //      11 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
+  //         - plus 1 otherwise
+
+  //  round to zero - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
+
+  //  round to -infinity
+  //          - Plus1 if negative unless a small number was supposed to be subtracted from a result with guard and round bits of 0
+  //          - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
+
+  //  round to infinity
+  //          - Plus1 if positive unless a small number was supposed to be subtracted from a result with guard and round bits of 0
+  //          - subtract 1 if a small number was supposed to be subtracted from a negative result with guard and round bits of 0
+
+  //  round to nearest max magnitude
+  //      {Guard, Round, Sticky}
+  //      0x - do nothing
+  //      10 - tie - Plus1
+  //          - don't add 1 if a small number was supposed to be subtracted
+  //      11 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
+  //         - Plus 1 otherwise
+
+
+  // determine what format the final result is in: int or fp
+  assign IntRes = ToInt;
+  assign FpRes = ~IntRes;
+
+  // sticky bit calculation: OR together every fraction bit below the rounding
+  // position of the selected output format
+  if (`FPSIZES == 1) begin
+
+      //     1: XLEN > NF
+      //      |         XLEN          |
+      //      |    NF     |1|1|
+      //                     ^    ^ if floating point result
+      //                     ^ if not an FMA result
+      if (`XLENPOS == 1)assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
+                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
+      //     2: NF > XLEN
+      if (`XLENPOS == 2)assign NormSticky = (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&IntRes) |
+                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);
+
+  end else if (`FPSIZES == 2) begin
+      // XLEN is either 64 or 32
+      // so half and single are always smaller than XLEN
+
+      // 1: XLEN > NF   > NF1
+      if (`XLENPOS == 1) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&FpRes&~OutFmt) |
+                                                (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
+                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
+      // 2: NF   > XLEN > NF1
+      if (`XLENPOS == 2) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~OutFmt) | 
+                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&(IntRes|~OutFmt)) |
+                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);
+      // 3: NF   > NF1  > XLEN
+      if (`XLENPOS == 3) assign NormSticky = (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF1-1]&IntRes) |
+                                                (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&(~OutFmt|IntRes)) |
+                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);
+
+  end else if (`FPSIZES == 3) begin
+      // 1: XLEN > NF   > NF1
+      if (`XLENPOS == 1) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`NF1-1]&FpRes&(OutFmt==`FMT1)) |
+                                                (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&FpRes&~(OutFmt==`FMT)) |
+                                                (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
+                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
+      // 2: NF   > XLEN > NF1
+      if (`XLENPOS == 2) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`NF1-1]&FpRes&(OutFmt==`FMT1)) |
+                                                (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~(OutFmt==`FMT)) | 
+                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&(IntRes|~(OutFmt==`FMT))) |
+                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);
+      // 3: NF   > NF1  > XLEN
+      if (`XLENPOS == 3) assign NormSticky = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&(OutFmt==`FMT1)) |
+                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF1-1]&((OutFmt==`FMT1)|IntRes)) |
+                                                (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&(~(OutFmt==`FMT)|IntRes)) |
+                                                (|Mf[`CORRSHIFTSZ-`NF-2:0]);
+
+  end else if (`FPSIZES == 4) begin
+      // Quad precision will always be greater than XLEN
+      // 2: NF   > XLEN > NF1
+      if (`XLENPOS == 2) assign NormSticky = (|Mf[`CORRSHIFTSZ-`H_NF-2:`CORRSHIFTSZ-`S_NF-1]&FpRes&(OutFmt==`H_FMT)) |
+                                                (|Mf[`CORRSHIFTSZ-`S_NF-2:`CORRSHIFTSZ-`D_NF-1]&FpRes&((OutFmt==`S_FMT)|(OutFmt==`H_FMT))) | 
+                                                (|Mf[`CORRSHIFTSZ-`D_NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~(OutFmt==`Q_FMT)) | 
+                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`Q_NF-1]&(~(OutFmt==`Q_FMT)|IntRes)) |
+                                                (|Mf[`CORRSHIFTSZ-`Q_NF-2:0]);
+      // 3: NF   > NF1  > XLEN
+      // The extra XLEN bit will be ored later when calculating the final sticky bit - the ufplus1 not needed for integer
+      if (`XLENPOS == 3) assign NormSticky = (|Mf[`CORRSHIFTSZ-`H_NF-2:`CORRSHIFTSZ-`S_NF-1]&FpRes&(OutFmt==`H_FMT)) |
+                                                (|Mf[`CORRSHIFTSZ-`S_NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&((OutFmt==`S_FMT)|(OutFmt==`H_FMT))) |
+                                                (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`D_NF-1]&((OutFmt==`S_FMT)|(OutFmt==`H_FMT)|IntRes)) |
+                                                (|Mf[`CORRSHIFTSZ-`D_NF-2:`CORRSHIFTSZ-`Q_NF-1]&(~(OutFmt==`Q_FMT)|IntRes)) |
+                                                (|Mf[`CORRSHIFTSZ-`Q_NF-2:0]);
+
+  end
+  
+
+
+  // final sticky bit
+  //      - legacy (all units): FmaASticky&FmaOp | NormSticky | CvtResUf&CvtOp | FmaMe[`NE+1]&FmaOp | DivSticky&DivOp
+  //      - divsqrt only: the divider's own sticky bit ORed with the bits of the normalized
+  //        fraction that fall below the rounding position. NormSticky must be included;
+  //        dropping it (as `Sticky = DivSticky` did) discards those low fraction bits
+  //        from the rounding and inexact decisions and leaves NormSticky unused.
+  assign Sticky = DivSticky | NormSticky;
+  
+
+
+
+  // determine round and LSB of the rounded value
+  //      - underflow round bit is used to determine the underflow flag
+  if (`FPSIZES == 1) begin
+      assign FpGuard = Mf[`CORRSHIFTSZ-`NF-1];
+      assign FpLsbRes = Mf[`CORRSHIFTSZ-`NF];
+      assign FpRound = Mf[`CORRSHIFTSZ-`NF-2];
+
+  end else if (`FPSIZES == 2) begin
+      assign FpGuard = OutFmt ? Mf[`CORRSHIFTSZ-`NF-1] : Mf[`CORRSHIFTSZ-`NF1-1];
+      assign FpLsbRes = OutFmt ? Mf[`CORRSHIFTSZ-`NF] : Mf[`CORRSHIFTSZ-`NF1];
+      assign FpRound = OutFmt ? Mf[`CORRSHIFTSZ-`NF-2] : Mf[`CORRSHIFTSZ-`NF1-2];
+
+  end else if (`FPSIZES == 3) begin
+      always_comb
+          case (OutFmt)
+              `FMT: begin
+                  FpGuard = Mf[`CORRSHIFTSZ-`NF-1];
+                  FpLsbRes = Mf[`CORRSHIFTSZ-`NF];
+                  FpRound = Mf[`CORRSHIFTSZ-`NF-2];
+              end
+              `FMT1: begin
+                  FpGuard = Mf[`CORRSHIFTSZ-`NF1-1];
+                  FpLsbRes = Mf[`CORRSHIFTSZ-`NF1];
+                  FpRound = Mf[`CORRSHIFTSZ-`NF1-2];
+              end
+              `FMT2: begin
+                  FpGuard = Mf[`CORRSHIFTSZ-`NF2-1];
+                  FpLsbRes = Mf[`CORRSHIFTSZ-`NF2];
+                  FpRound = Mf[`CORRSHIFTSZ-`NF2-2];
+              end
+              default: begin
+                  FpGuard = 1'bx;
+                  FpLsbRes = 1'bx;
+                  FpRound = 1'bx;
+              end
+          endcase
+  end else if (`FPSIZES == 4) begin
+      always_comb
+          case (OutFmt)
+              2'h3: begin
+                  FpGuard = Mf[`CORRSHIFTSZ-`Q_NF-1];
+                  FpLsbRes = Mf[`CORRSHIFTSZ-`Q_NF];
+                  FpRound = Mf[`CORRSHIFTSZ-`Q_NF-2];
+              end
+              2'h1: begin
+                  FpGuard = Mf[`CORRSHIFTSZ-`D_NF-1];
+                  FpLsbRes = Mf[`CORRSHIFTSZ-`D_NF];
+                  FpRound = Mf[`CORRSHIFTSZ-`D_NF-2];
+              end
+              2'h0: begin
+                  FpGuard = Mf[`CORRSHIFTSZ-`S_NF-1];
+                  FpLsbRes = Mf[`CORRSHIFTSZ-`S_NF];
+                  FpRound = Mf[`CORRSHIFTSZ-`S_NF-2];
+              end
+              2'h2: begin
+                  FpGuard = Mf[`CORRSHIFTSZ-`H_NF-1];
+                  FpLsbRes = Mf[`CORRSHIFTSZ-`H_NF];
+                  FpRound = Mf[`CORRSHIFTSZ-`H_NF-2];
+              end
+          endcase
+  end
+
+  // legacy selection between integer and fp rounding bits (conversion path, disabled here)
+  /*assign Guard = ToInt&CvtOp ? Mf[`CORRSHIFTSZ-`XLEN-1] : FpGuard;
+  assign LsbRes = ToInt&CvtOp ? Mf[`CORRSHIFTSZ-`XLEN] : FpLsbRes;
+  assign Round = ToInt&CvtOp ? Mf[`CORRSHIFTSZ-`XLEN-2] : FpRound;*/
+  
+  // divsqrt results are always floating point, so use the fp bits directly
+  assign Guard =  FpGuard;
+  assign LsbRes = FpLsbRes;
+  assign Round =  FpRound;
+
+
+  always_comb begin
+      // Determine if you add 1
+      case (Frm)
+          3'b000: CalcPlus1 = Guard & (Round|Sticky|LsbRes);//round to nearest even
+          3'b001: CalcPlus1 = 0;//round to zero
+          3'b010: CalcPlus1 = Ms;//round down
+          3'b011: CalcPlus1 = ~Ms;//round up
+          3'b100: CalcPlus1 = Guard;//round to nearest max magnitude
+          default: CalcPlus1 = 1'bx;
+      endcase
+      // Determine if you add 1 (for underflow flag)
+      case (Frm)
+          3'b000: UfCalcPlus1 = Round & (Sticky|Guard);//round to nearest even
+          3'b001: UfCalcPlus1 = 0;//round to zero
+          3'b010: UfCalcPlus1 = Ms;//round down
+          3'b011: UfCalcPlus1 = ~Ms;//round up
+          3'b100: UfCalcPlus1 = Round;//round to nearest max magnitude
+          default: UfCalcPlus1 = 1'bx;
+      endcase
+  
+  end
+
+  // If an answer is exact don't round
+  assign Plus1 = CalcPlus1 & (Sticky|Round|Guard);
+  //assign FpPlus1 = Plus1&~(ToInt&CvtOp);
+  assign FpPlus1 = Plus1;
+  assign UfPlus1 = UfCalcPlus1 & (Sticky|Round);
+
+
+
+
+  // place Plus1 into the proper position for the format
+  if (`FPSIZES == 1) begin
+      assign RoundAdd = {{`FLEN{1'b0}}, FpPlus1};
+
+  end else if (`FPSIZES == 2) begin
+      // \/FLEN+1
+      //  | NE+2 |        NF      |
+      //  '-NE+2-^----NF1----^
+      // `FLEN+1-`NE-2-`NF1 = FLEN-1-NE-NF1
+      assign RoundAdd = {(`NE+1+`NF1)'(0), FpPlus1&~OutFmt, (`NF-`NF1-1)'(0), FpPlus1&OutFmt};
+
+  end else if (`FPSIZES == 3) begin
+      assign RoundAdd = {(`NE+1+`NF2)'(0), FpPlus1&(OutFmt==`FMT2), (`NF1-`NF2-1)'(0), FpPlus1&(OutFmt==`FMT1), (`NF-`NF1-1)'(0), FpPlus1&(OutFmt==`FMT)};
+
+  end else if (`FPSIZES == 4)      
+      assign RoundAdd = {(`Q_NE+1+`H_NF)'(0), FpPlus1&(OutFmt==`H_FMT), (`S_NF-`H_NF-1)'(0), FpPlus1&(OutFmt==`S_FMT), (`D_NF-`S_NF-1)'(0), FpPlus1&(OutFmt==`D_FMT), (`Q_NF-`D_NF-1)'(0), FpPlus1&(OutFmt==`Q_FMT)};
+
+
+
+  // trim unneeded bits from fraction
+  assign RoundFrac = Mf[`CORRSHIFTSZ-1:`CORRSHIFTSZ-`NF];
+  
+
+
+  // select the exponent: only the divsqrt exponent exists in this rounder
+  assign Me = Qe;
+  /*always_comb
+      case(PostProcSel)
+          2'b10: Me = FmaMe; // fma
+          2'b00: Me = {CvtCe[`NE], CvtCe}&{`NE+2{~CvtResSubnormUf|CvtResUf}}; // cvt
+          // 2'b01: Me = DivDone ? Qe : '0; // divide
+          2'b01: Me = Qe; // divide
+          default: Me = '0; 
+      endcase*/
+
+
+
+  // round the result
+  //      - exponent and fraction are added as one word, so a carry out of the
+  //        fraction increments the exponent
+  assign {FullRe, Rf} = {Me, RoundFrac} + RoundAdd;
+  assign Re = FullRe[`NE-1:0];
+
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtroundsign.sv
+++ b/src/fpu/divremsqrt/divremsqrtroundsign.sv
@ -0,0 +1,46 @@
+///////////////////////////////////////////
+// divremsqrtroundsign.sv
+//
+// Written: kekim@hmc.edu,me@KatherineParry.com
+// Modified: 19 May 2023
+//
+// Purpose: Sign calculation for rounding
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+`include "wally-config.vh"
+
+module roundsign(
+  input logic         Xs,     // sign of operand X
+  input logic         Ys,     // sign of operand Y
+  input logic         Sqrt,   // square root operation? (when using the divsqrt unit)
+  input logic         DivOp,  // is this a divsqrt operation?
+  output logic        Ms      // normalized result sign
+);
+
+  logic               DivYs;        // Y sign with the square root case masked off
+  logic               DivSqrtSign;  // sign of the divsqrt result
+
+  // Sign used by the rounding calculation, built up in three steps
+  always_comb begin
+    // Ys is masked off when Sqrt is set, so it cannot affect the result sign
+    DivYs       = Ys & ~Sqrt;
+    // divide: XOR of both operand signs; square root: sign of X
+    DivSqrtSign = Xs ^ DivYs;
+    // only a divsqrt operation contributes a sign; otherwise Ms is 0
+    Ms          = DivSqrtSign & DivOp;
+  end
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtshiftcorrection.sv
+++ b/src/fpu/divremsqrt/divremsqrtshiftcorrection.sv
@ -0,0 +1,93 @@
+///////////////////////////////////////////
+// divremsqrtshiftcorrection.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: shift correction
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module divremsqrtshiftcorrection(
+  input logic  [`NORMSHIFTSZ-1:0] Shifted,                // the shifted divsqrt result before the one bit correction
+  // divsqrt
+  input logic                     DivOp,                  // is it a divsqrt operation
+  input logic                     DivResSubnorm,          // is the divsqrt result subnormal
+  input logic  [`NE+1:0]          DivQe,                  // the divsqrt result's exponent
+  input logic                     DivSubnormShiftPos,     // is the subnorm divider shift amount positive (ie not underflowed)
+  // output
+  output logic [`CORRSHIFTSZ-1:0] Mf,                     // the shifted result after correction
+  output logic [`NE+1:0]          Qe                      // corrected exponent for divider
+);
+
+  // This module only handles divsqrt results. The FMA-only LZA correction mux
+  // (CorrSumShifted) and the ResSubnorm flag were removed: nothing read them, and
+  // CorrSumShifted was declared 3*`NF+4 bits wide while its mux was `NORMSHIFTSZ-2
+  // bits wide, a port width mismatch whenever `NORMSHIFTSZ != 3*`NF+6.
+
+  logic [`CORRSHIFTSZ-1:0]    CorrQm0, CorrQm1;           // portions of Shifted to select for CorrQmShifted
+  logic [`CORRSHIFTSZ-1:0]    CorrQmShifted;              // the shifted divsqrt result after one bit shift
+  logic                       LZAPlus1;                   // msb of Shifted; when clear the exponent is decremented by one
+  logic                       LeftShiftQm;                // select for the one bit divsqrt correction mux
+
+  // msb of the shifted result
+  assign LZAPlus1 = Shifted[`NORMSHIFTSZ-1];
+
+  // correct the shifting of the divsqrt caused by producing a result in the [.5, 2) range
+  //    condition: if the msb is 1 or the exponent was one, but the shifted quotient was < 1 (Subnorm)
+  //    note: == binds tighter than &, the parentheses only make that explicit
+  assign LeftShiftQm = (LZAPlus1 | ((DivQe == 1) & ~LZAPlus1));
+  // the two candidate windows of Shifted, one bit apart
+  assign CorrQm0 = Shifted[`NORMSHIFTSZ-3:`NORMSHIFTSZ-`CORRSHIFTSZ-2];
+  assign CorrQm1 = Shifted[`NORMSHIFTSZ-2:`NORMSHIFTSZ-`CORRSHIFTSZ-1];
+  mux2 #(`CORRSHIFTSZ) divcorrmux(CorrQm0, CorrQm1, LeftShiftQm, CorrQmShifted);
+  
+  // if the result of the divider was calculated to be subnormal, then the result was correctly normalized, so select the top shifted bits
+  always_comb
+    if (DivOp&~DivResSubnorm)  Mf = CorrQmShifted;
+    else                       Mf = Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ];
+
+  // the quotient is in the range [.5,2) if there is no early termination
+  // if the quotient < 1 and not Subnormal then subtract 1 to account for the normalization shift
+  // a subnormal result with a positive subnormal shift amount gets a zero exponent
+  assign Qe = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivQe - {(`NE+1)'(0), ~LZAPlus1};
+endmodule