From f8af51e07be4c2ae7c7457693957dbe199e1e5f9 Mon Sep 17 00:00:00 2001 From: David Harris Date: Sun, 1 Jan 2023 13:54:01 -0800 Subject: [PATCH] Handle special case Int Div/Rem of |A| < |B| in a single cycle --- pipelined/config/shared/wally-shared.vh | 2 +- pipelined/src/fpu/fdivsqrt/fdivsqrt.sv | 10 +++++----- pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv | 5 +++-- pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 11 ++++++----- pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 16 ++++++++++------ pipelined/src/ieu/forward.sv | 2 +- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh index e79e5561..7d54f50b 100644 --- a/pipelined/config/shared/wally-shared.vh +++ b/pipelined/config/shared/wally-shared.vh @@ -24,7 +24,7 @@ // division constants `define RADIX 32'h2 -`define DIVCOPIES 32'h2 +`define DIVCOPIES 32'h1 // Memory synthesis configuration `define USE_SRAM 0 diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv index de51eeab..dc07078f 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv @@ -68,20 +68,20 @@ module fdivsqrt( logic DivStartE; // Enable signal for flops during stall // Integer div/rem signals - logic AZeroE, BZeroE; // Numerator/Denominator is zero (Execute) - logic AZeroM, BZeroM; // Numerator/Denominator is zero (Memory) + logic BZeroE, BZeroM; // Denominator is zero logic MDUM; // Integer operation logic [`DIVBLEN:0] nE, nM, mM; // Shift amounts logic NegQuotM, ALTBM, AsM, W64M; // Special handling for postprocessor logic [`XLEN-1:0] AM; // Original Numerator for postprocessor + logic ISpecialCaseE; // Integer div/remainder special cases fdivsqrtpreproc fdivsqrtpreproc( // Preprocessor .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE), .Fmt(FmtE), .Sqrt(SqrtE), .XZeroE, .Funct3E, .QeM, .X, .DPreproc, // Int-specific - .ForwardedSrcAE, .ForwardedSrcBE, .MDUE, .W64E, 
- .AZeroE, .BZeroE, .nE, .AZeroM, .BZeroM, .nM, .mM, .AM, + .ForwardedSrcAE, .ForwardedSrcBE, .MDUE, .W64E, .ISpecialCaseE, + .BZeroE, .nE, .BZeroM, .nM, .mM, .AM, .MDUM, .W64M, .NegQuotM, .ALTBM, .AsM); fdivsqrtfsm fdivsqrtfsm( // FSM @@ -89,7 +89,7 @@ module fdivsqrt( .FDivStartE, .XsE, .SqrtE, .WZeroE, .FlushE, .StallM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .SpecialCaseM, // Int-specific - .IDivStartE, .AZeroE, .BZeroE, .nE, .MDUE); + .IDivStartE, .BZeroE, .ISpecialCaseE, .nE, .MDUE); fdivsqrtiter fdivsqrtiter( // CSA Iterator .clk, .IFDivStartE, .FDivBusyE, .SqrtE, .X, .DPreproc, diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv index 9e42cadb..6c1348d6 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv @@ -36,7 +36,7 @@ module fdivsqrtfsm( input logic [`FMTBITS-1:0] FmtE, input logic XInfE, YInfE, input logic XZeroE, YZeroE, - input logic AZeroE, BZeroE, + input logic BZeroE, input logic XNaNE, YNaNE, input logic FDivStartE, IDivStartE, input logic XsE, @@ -46,6 +46,7 @@ module fdivsqrtfsm( input logic WZeroE, input logic MDUE, input logic [`DIVBLEN:0] nE, + input logic ISpecialCaseE, output logic IFDivStartE, output logic FDivBusyE, FDivDoneE, output logic SpecialCaseM @@ -65,7 +66,7 @@ module fdivsqrtfsm( // terminate immediately on special cases assign FSpecialCaseE = XZeroE | (YZeroE&~SqrtE) | XInfE | YInfE | XNaNE | YNaNE | (XsE&SqrtE); - if (`IDIV_ON_FPU) assign SpecialCaseE = MDUE ? BZeroE : FSpecialCaseE; + if (`IDIV_ON_FPU) assign SpecialCaseE = MDUE ? 
ISpecialCaseE : FSpecialCaseE; else assign SpecialCaseE = FSpecialCaseE; flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv index 75d6a323..e1012120 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv @@ -99,7 +99,7 @@ module fdivsqrtpostproc( mux2 #(`DIVb+1) preqmmux(FirstU, FirstUM, NegStickyM, PreQmM); // Select U or U-1 depending on negative sticky bit mux2 #(`DIVb+1) qmmux(PreQmM, (PreQmM << 1), SqrtM, QmM); - if (`IDIV_ON_FPU) begin // Int supported + if (`IDIV_ON_FPU) begin:intpostproc // Int supported logic [`DIVBLEN:0] NormShiftM; logic [`DIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM; @@ -121,18 +121,19 @@ module fdivsqrtpostproc( NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR))); PreResultM = NormQuotM; end - PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM); + PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM); // *** rename to PreIntResultM? end // special case logic + // terminates immediately when B is Zero (div 0) or |A| has more leading 0s than |B| always_comb if (BZeroM) begin // Divide by zero - if (RemOpM) SpecialFPIntDivResultM = AM; + if (RemOpM) SpecialFPIntDivResultM = AM; // *** rename to IntDivResult? 
else SpecialFPIntDivResultM = {(`XLEN){1'b1}}; - end else if (ALTBM) begin // Numerator is zero + end else if (ALTBM) begin // |A| has more leading 0s than |B| (quotient 0, remainder A) if (RemOpM) SpecialFPIntDivResultM = AM; else SpecialFPIntDivResultM = '0; - end else SpecialFPIntDivResultM = PreFPIntDivResultM[`XLEN-1:0]; + end else SpecialFPIntDivResultM = PreFPIntDivResultM[`XLEN-1:0]; // sign extend result for W64 if (`XLEN==64) begin diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv index 0b513211..968d7cbc 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv @@ -45,9 +45,10 @@ module fdivsqrtpreproc ( // Int-specific input logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B input logic MDUE, W64E, + output logic ISpecialCaseE, output logic [`DIVBLEN:0] nE, nM, mM, output logic NegQuotM, ALTBM, MDUM, W64M, - output logic AsM, AZeroM, BZeroM, AZeroE, BZeroE, + output logic AsM, BZeroM, BZeroE, output logic [`XLEN-1:0] AM ); @@ -58,8 +59,9 @@ module fdivsqrtpreproc ( logic [`DIVb-1:0] IFNormLenX, IFNormLenD; // Correctly-sized inputs for iterator logic [`DIVBLEN:0] mE, ell; // Leading zeros of inputs logic NumerZeroE; // Numerator is zero (X or A) + logic AZeroE; // A is Zero for integer division - if (`IDIV_ON_FPU) begin // Int Supported + if (`IDIV_ON_FPU) begin:intpreproc // Int Supported logic signedDiv, NegQuotE; logic AsBit, BsBit, AsE, BsE, ALTBE; logic [`XLEN-1:0] AE, BE, PosA, PosB; @@ -98,8 +100,11 @@ module fdivsqrtpreproc ( // calculate number of fractional bits p assign ZeroDiff = mE - ell; // Difference in number of leading zeros - assign ALTBE = ZeroDiff[`DIVBLEN]; // A less than B? 
- mux2 #(`DIVBLEN+1) pmux(ZeroDiff, 0, ALTBE, p); + assign ALTBE = ZeroDiff[`DIVBLEN]; // A less than B (A has more leading zeros) + mux2 #(`DIVBLEN+1) pmux(ZeroDiff, {(`DIVBLEN+1){1'b0}}, ALTBE, p); // *** is there a more graceful way to write these constants + + // Integer special cases (terminate immediately) + assign ISpecialCaseE = BZeroE | ALTBE; /* verilator lint_off WIDTH */ // calculate number of fractional digits nE and right shift amount RightShiftX to complete in discrete number of steps @@ -113,7 +118,7 @@ module fdivsqrtpreproc ( assign IntSteps = (TotalIntBits >> `LOGRK) + |IntTrunc; // Number of steps for int div assign nE = (IntSteps * `DIVCOPIES) - 1; // Fractional digits assign RightShiftX = `RK - 1 - ((TotalIntBits - 1) % `RK); // Right shift amount - assign DivXShifted = DivX >> RightShiftX; // shift X to complete in nE steps + assign DivXShifted = DivX >> RightShiftX; // shift X by up to R*K-1 to complete in nE steps end else begin // radix 2 1 copy doesn't require shifting assign nE = p; assign DivXShifted = DivX; @@ -129,7 +134,6 @@ module fdivsqrtpreproc ( flopen #(1) w64reg(clk, IFDivStartE, W64E, W64M); flopen #(1) altbreg(clk, IFDivStartE, ALTBE, ALTBM); flopen #(1) negquotreg(clk, IFDivStartE, NegQuotE, NegQuotM); - flopen #(1) azeroreg(clk, IFDivStartE, AZeroE, AZeroM); flopen #(1) bzeroreg(clk, IFDivStartE, BZeroE, BZeroM); flopen #(1) asignreg(clk, IFDivStartE, AsE, AsM); flopen #(`DIVBLEN+1) nreg(clk, IFDivStartE, nE, nM); diff --git a/pipelined/src/ieu/forward.sv b/pipelined/src/ieu/forward.sv index 701a7d43..ff576cf3 100644 --- a/pipelined/src/ieu/forward.sv +++ b/pipelined/src/ieu/forward.sv @@ -60,6 +60,6 @@ module forward( assign MatchDE = ((Rs1D == RdE) | (Rs2D == RdE)) & (RdE != 5'b0); // Decode-stage instruction source depends on result from execute stage instruction assign FCvtIntStallD = FCvtIntE & MatchDE; // FPU to Integer transfers have single-cycle latency except fcvt assign LoadStallD = (MemReadE|SCE) & MatchDE; - 
assign MDUStallD = MDUE & MatchDE; + assign MDUStallD = MDUE & MatchDE; // Int mult/div has a latency of at least two cycles, even when the result comes from the FDIV assign CSRRdStallD = CSRReadE & MatchDE; endmodule