FPU divider working with execute stage stall

This commit is contained in:
David Harris 2022-12-02 11:11:53 -08:00
parent a86c9de36b
commit db5f3c15a4
12 changed files with 74 additions and 44 deletions

View File

@ -43,13 +43,14 @@ module fdivsqrt(
input logic FDivStartE, IDivStartE,
input logic StallM,
input logic StallE,
input logic TrapM,
input logic SqrtE, SqrtM,
input logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
input logic [2:0] Funct3E, Funct3M,
input logic MDUE, W64E,
output logic DivSM,
output logic FDivBusyE,
output logic DivDone,
output logic FDivBusyE, DivStartE, FDivDoneE,
// output logic DivDone,
output logic [`NE+1:0] QeM,
output logic [`DIVb:0] QmM
// output logic [`XLEN-1:0] RemM,
@ -66,7 +67,6 @@ module fdivsqrt(
logic SpecialCaseM;
logic [`DIVBLEN:0] n, m;
logic OTFCSwap, ALTB, BZero, As;
logic DivStartE;
fdivsqrtpreproc fdivsqrtpreproc(
.clk, .DivStartE, .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE),
@ -75,11 +75,11 @@ module fdivsqrt(
.ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E);
fdivsqrtfsm fdivsqrtfsm(
.clk, .reset, .FmtE, .XsE, .SqrtE,
.FDivBusyE, .FDivStartE, .IDivStartE, .DivStartE, .StallE, .StallM, .DivDone, .XZeroE, .YZeroE,
.FDivBusyE, .FDivStartE, .IDivStartE, .DivStartE, .FDivDoneE, .StallE, .StallM, .TrapM, /*.DivDone, */ .XZeroE, .YZeroE,
.XNaNE, .YNaNE, .MDUE, .n,
.XInfE, .YInfE, .WZero, .SpecialCaseM);
fdivsqrtiter fdivsqrtiter(
.clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, .SqrtM,
.clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, // .SqrtM,
.X,.Dpreproc, .FirstWS(WS), .FirstWC(WC),
.DivStartE, .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE, .OTFCSwap,
.FDivBusyE);

View File

@ -42,12 +42,13 @@ module fdivsqrtfsm(
input logic SqrtE,
input logic StallE,
input logic StallM,
input logic TrapM,
input logic WZero,
input logic MDUE,
input logic [`DIVBLEN:0] n,
output logic DivStartE,
output logic DivDone,
output logic FDivBusyE,
// output logic DivDone,
output logic FDivBusyE, FDivDoneE,
output logic SpecialCaseM
);
@ -61,8 +62,10 @@ module fdivsqrtfsm(
// *** start logic is presently in fctrl. Make it look more like integer division start logic
// DivStartE comes from fctrl, reflecting the start of floating-point and possibly integer division
assign DivStartE = (FDivStartE | IDivStartE) & (state == IDLE) & ~StallM;
assign DivDone = (state == DONE) | (WZero & (state == BUSY)); // *** used in postprocess.sv and round.sv. This doesn't seem proper. They break when removed.
assign FDivBusyE = (state == BUSY & ~DivDone); // *** want to add | DivStartE but it creates comb loop
assign FDivDoneE = (state == DONE);
// assign DivDone = (state == DONE) | (WZero & (state == BUSY)); // *** used in postprocess.sv and round.sv. This doesn't seem proper. They break when removed.
//assign FDivBusyE = (state == BUSY & ~DivDone); // *** want to add | DivStartE but it creates comb loop
assign FDivBusyE = (state == BUSY) | DivStartE;
// Divider control signals from MDU
//assign DivBusyE = (state == BUSY) | DivStartE;
@ -110,6 +113,23 @@ module fdivsqrtfsm(
/* verilator lint_on WIDTH */
// State and step-counter register for the divide/sqrt FSM.
// Branch priority is top to bottom: reset/TrapM, then DivStartE, then BUSY, then DONE.
always_ff @(posedge clk) begin
if (reset | TrapM) begin // reset or TrapM forces the FSM back to IDLE
state <= #1 IDLE;
end else if (DivStartE) begin // a new operation starts: load the step counter from cycles
step <= cycles;
if (SpecialCaseE) state <= #1 DONE; // special cases bypass BUSY and go directly to DONE
else state <= #1 BUSY;
end else if (state == BUSY) begin
if (step == 1) state <= #1 DONE; // leave BUSY on the cycle where the counter reads 1
step <= step - 1; // counter decrements on every BUSY cycle
// NOTE(review): the (WZero & (state == BUSY)) term in the next condition can never be true,
// because the branch above already captures every state == BUSY cycle; effectively only
// state == DONE reaches it. If early termination on WZero is intended, it would have to be
// handled inside the BUSY branch -- confirm the intent.
end else if ((state == DONE) | (WZero & (state == BUSY))) begin
if (StallM) state <= #1 DONE; // hold DONE while StallM is asserted
else state <= #1 IDLE; // otherwise return to IDLE
end
end
/*
always_ff @(posedge clk) begin
if (reset) begin
state <= #1 IDLE;
@ -129,6 +149,6 @@ module fdivsqrtfsm(
step <= step - 1;
end
end
*/
endmodule

View File

@ -37,7 +37,7 @@ module fdivsqrtiter(
input logic [`NE-1:0] Xe, Ye,
input logic XZeroE, YZeroE,
input logic SqrtE,
input logic SqrtM,
// input logic SqrtM,
input logic OTFCSwap,
input logic [`DIVb+3:0] X,
input logic [`DIVN-2:0] Dpreproc,
@ -85,8 +85,8 @@ module fdivsqrtiter(
// Residual WS/WC registers/initialization mux
mux2 #(`DIVb+4) wsmux(WS[`DIVCOPIES], X, DivStartE, WSN);
mux2 #(`DIVb+4) wcmux(WC[`DIVCOPIES], '0, DivStartE, WCN);
flopen #(`DIVb+4) wsflop(clk, DivStartE|FDivBusyE, WSN, WS[0]);
flopen #(`DIVb+4) wcflop(clk, DivStartE|FDivBusyE, WCN, WC[0]);
flopen #(`DIVb+4) wsflop(clk, FDivBusyE, WSN, WS[0]);
flopen #(`DIVb+4) wcflop(clk, FDivBusyE, WCN, WC[0]);
// UOTFC Result U and UM registers/initialization mux
// Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 for division
@ -122,13 +122,13 @@ module fdivsqrtiter(
generate
for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : iterations
if (`RADIX == 2) begin: stage
fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtM, .OTFCSwap,
fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtE, .OTFCSwap,
.WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
.C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
end else begin: stage
logic j1;
assign j1 = (i == 0 & ~C[0][`DIVb-1]);
fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtM, .j1, .OTFCSwap,
fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1, .OTFCSwap,
.WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
.C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
end

View File

@ -134,6 +134,7 @@ module fdivsqrtpostproc(
// division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
// *** Result is unused right now
assign Result = ($signed(PreResult) >>> NormShift) + {{(`DIVb+3){1'b0}}, (PostInc & ~RemOp)};
assign PreQmM = NegSticky ? FirstUM : FirstU; // Select U or U-1 depending on negative sticky bit

View File

@ -37,7 +37,7 @@ module fdivsqrtstage2 (
input logic [`DIVb:0] U, UM,
input logic [`DIVb+3:0] WS, WC,
input logic [`DIVb+1:0] C,
input logic SqrtM,
input logic SqrtE,
input logic OTFCSwap,
output logic un,
output logic [`DIVb+1:0] CNext,
@ -73,8 +73,8 @@ module fdivsqrtstage2 (
// Partial Product Generation
// WSA, WCA = WS + WC - qD
assign AddIn = SqrtM ? F : Dsel;
csa #(`DIVb+4) csa(WS, WC, AddIn, up&~SqrtM, WSA, WCA);
assign AddIn = SqrtE ? F : Dsel;
csa #(`DIVb+4) csa(WS, WC, AddIn, up&~SqrtE, WSA, WCA);
assign WSNext = WSA << 1;
assign WCNext = WCA << 1;

View File

@ -36,7 +36,7 @@ module fdivsqrtstage4 (
input logic [`DIVb:0] U, UM,
input logic [`DIVb+3:0] WS, WC,
input logic [`DIVb+1:0] C,
input logic SqrtM, j1, OTFCSwap,
input logic SqrtE, j1, OTFCSwap,
output logic [`DIVb+1:0] CNext,
output logic un,
output logic [`DIVb:0] UNext, UMNext,
@ -65,7 +65,7 @@ module fdivsqrtstage4 (
assign WCmsbs = WC[`DIVb+3:`DIVb-4];
assign WSmsbs = WS[`DIVb+3:`DIVb-4];
fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .Sqrt(SqrtM), .j1, .udigit, .OTFCSwap);
fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .Sqrt(SqrtE), .j1, .udigit, .OTFCSwap);
assign un = 1'b0; // unused for radix 4
// F generation logic
@ -84,8 +84,8 @@ module fdivsqrtstage4 (
// Residual Update
// {WS, WC}}Next = (WS + WC - qD or F) << 2
assign AddIn = SqrtM ? F : Dsel;
assign CarryIn = ~SqrtM & (udigit[3] | udigit[2]); // +1 for 2's complement of -D and -2D
assign AddIn = SqrtE ? F : Dsel;
assign CarryIn = ~SqrtE & (udigit[3] | udigit[2]); // +1 for 2's complement of -D and -2D
csa #(`DIVb+4) csa(WS, WC, AddIn, CarryIn, WSA, WCA);
assign WSNext = WSA << 2;
assign WCNext = WCA << 2;
@ -94,7 +94,7 @@ module fdivsqrtstage4 (
assign CNext = {2'b11, C[`DIVb+1:2]};
// On-the-fly converter to accumulate result
fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtE), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
endmodule

View File

@ -38,6 +38,7 @@ module fpu (
input logic [`FLEN-1:0] ReadDataW, // Read data (from LSU)
input logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // Integer input (from IEU)
input logic StallE, StallM, StallW, // stall signals (from HZU)
input logic TrapM,
input logic FlushE, FlushM, FlushW, // flush signals (from HZU)
input logic [4:0] RdM, RdW, // which FP register to write to (from IEU)
input logic [1:0] STATUS_FS, // Is floating-point enabled? (From privileged unit)
@ -129,7 +130,8 @@ module fpu (
logic [`DIVb:0] QmM;
logic [`NE+1:0] QeE, QeM;
logic DivSE, DivSM;
logic DivDoneM;
// logic DivDoneM;
logic FDivDoneE, DivStartE;
// result and flag signals
logic [`XLEN-1:0] ClassResE; // classify result
@ -149,6 +151,7 @@ module fpu (
logic [`FLEN-1:0] AlignedSrcAE; // align SrcA to the floating point format
logic [`FLEN-1:0] BoxedZeroE; // Zero value for Z for multiplication, with NaN boxing if needed
logic [`FLEN-1:0] BoxedOneE; // One value (per the signal name; its use is not shown here), with NaN boxing if needed
logic EMRegEn;
// DECODE STAGE
@ -176,7 +179,7 @@ module fpu (
.a4(RdW), .wd4(FPUResultW),
.rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));
// D/E pipeline registers
// D/E pipeline registers
flopenrc #(`FLEN) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
flopenrc #(`FLEN) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
flopenrc #(`FLEN) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
@ -263,8 +266,8 @@ module fpu (
fdivsqrt fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
.XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
.ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E,
.StallE, .StallM, .DivSM, .FDivBusyE, .QeM,
.QmM, .DivDone(DivDoneM));
.StallE, .StallM, .TrapM, .DivSM, .FDivBusyE, .DivStartE, .FDivDoneE, .QeM,
.QmM /*, .DivDone(DivDoneM) */);
//
// compare
@ -337,15 +340,20 @@ module fpu (
// E/M pipe registers
// flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, XE, FSrcXM);
flopenrc #(`NF+2) EMFpReg2 (clk, reset, FlushM, ~StallM, {XsE,XmE}, {XsM,XmM});
flopenrc #(`NF+2) EMFpReg3 (clk, reset, FlushM, ~StallM, {YsE,YmE}, {YsM,YmM});
assign EMRegEn = ~StallM & (~FDivBusyE & ~FDivDoneE | DivStartE);
// flopenrc #(64) EMFpReg1(clk, reset, FlushM, EMRegEn, XE, FSrcXM);
flopenrc #(`NF+1) EMFpReg2 (clk, reset, FlushM, ~StallM, XmE, XmM);
flopenrc #(`NF+1) EMFpReg3 (clk, reset, FlushM, ~StallM, YmE, YmM);
flopenrc #(`FLEN) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZeE,ZmE}, {ZeM,ZmM});
flopenrc #(`XLEN) EMFpReg6 (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
flopenrc #(`FLEN) EMFpReg7 (clk, reset, FlushM, ~StallM, PreFpResE, PreFpResM);
flopenrc #(13) EMFpReg5 (clk, reset, FlushM, ~StallM,
flopenr #(15) EMFpReg5 (clk, reset, EMRegEn,
{XsE, YsE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE, ZDenormE},
{XsM, YsM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM, ZDenormM});
/* flopenrc #(13) EMFpReg5 (clk, reset, FlushM, ~StallM,
{XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE, ZDenormE},
{XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM, ZDenormM});
{XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM, ZDenormM}); */
flopenrc #(1) EMRegCmpFlg (clk, reset, FlushM, ~StallM, PreNVE, PreNVM);
flopenrc #(3*`NF+6) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM);
flopenrc #(`NE+2) EMRegFma3(clk, reset, FlushM, ~StallM, PeE, PeM);
@ -372,7 +380,7 @@ module fpu (
postprocess postprocess(.Xs(XsM), .Ys(YsM), .Ze(ZeM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), .FmaPe(PeM),
.FmaZmS(ZmStickyM), .FmaKillProd(KillProdM), .XZero(XZeroM), .YZero(YZeroM), .ZZero(ZZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
.ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SmM), .DivQe(QeM), .DivDone(DivDoneM),
.ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SmM), .DivQe(QeM), /*.DivDone(DivDoneM), */
.FmaNegSum(NegSumM), .FmaInvA(InvAM), .ZDenorm(ZDenormM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
.CvtCe(CeM), .CvtResDenormUf(CvtResDenormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), .ToInt(FWriteIntM), .DivS(DivSM),
.CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));

View File

@ -58,7 +58,7 @@ module postprocess (
input logic [$clog2(3*`NF+7)-1:0] FmaSCnt, // the normalization shift count
//divide signals
input logic DivS,
input logic DivDone,
// input logic DivDone,
input logic [`NE+1:0] DivQe,
input logic [`DIVb:0] DivQm,
// conversion signals
@ -129,7 +129,7 @@ module postprocess (
assign Mult = OpCtrl[2]&~OpCtrl[1]&~OpCtrl[0];
assign CvtOp = (PostProcSel == 2'b00);
assign FmaOp = (PostProcSel == 2'b10);
assign DivOp = (PostProcSel == 2'b01) & DivDone;
assign DivOp = (PostProcSel == 2'b01); // & DivDone;
assign Sqrt = OpCtrl[0];
// is there an input of infinity or NaN being used
@ -165,13 +165,13 @@ module postprocess (
ShiftIn = {CvtShiftIn, {`NORMSHIFTSZ-`CVTLEN-`NF-1{1'b0}}};
end
2'b01: begin //div
if(DivDone) begin
/* if(DivDone) begin */
ShiftAmt = DivShiftAmt;
ShiftIn = DivShiftIn;
end else begin
/* end else begin
ShiftAmt = '0;
ShiftIn = '0;
end
end */
end
default: begin
ShiftAmt = {`LOGNORMSHIFTSZ{1'bx}};
@ -201,7 +201,7 @@ module postprocess (
round round(.OutFmt, .Frm, .FmaZmS, .Plus1, .PostProcSel, .CvtCe, .Qe,
.Ms, .FmaMe, .FmaOp, .CvtOp, .CvtResDenormUf, .Mf, .ToInt, .CvtResUf,
.DivS, .DivDone,
.DivS, //.DivDone,
.DivOp, .UfPlus1, .FullRe, .Rf, .Re, .S, .R, .G, .Me);
///////////////////////////////////////////////////////////////////////////////

View File

@ -43,7 +43,7 @@ module round(
input logic DivOp,
input logic CvtOp,
input logic ToInt,
input logic DivDone,
// input logic DivDone,
input logic [1:0] PostProcSel,
input logic CvtResDenormUf,
input logic CvtResUf,
@ -295,7 +295,8 @@ module round(
case(PostProcSel)
2'b10: Me = FmaMe; // fma
2'b00: Me = {CvtCe[`NE], CvtCe}&{`NE+2{~CvtResDenormUf|CvtResUf}}; // cvt
2'b01: Me = DivDone ? Qe : '0; // divide
// 2'b01: Me = DivDone ? Qe : '0; // divide
2'b01: Me = Qe; // divide
default: Me = '0;
endcase

View File

@ -65,10 +65,10 @@ module hazard(
assign StallFCause = CSRWriteFencePendingDEM & ~(TrapM | RetM | BPPredWrongE);
// stall in decode if instruction is a load/mul/csr dependent on previous
assign StallDCause = (LoadStallD | StoreStallD | MDUStallD | CSRRdStallD | FPUStallD | FStallD) & ~(TrapM | RetM | BPPredWrongE);
assign StallECause = (DivBusyE) & ~(TrapM); // *** can we move to decode stage (KP?)
assign StallECause = (DivBusyE | FDivBusyE) & ~(TrapM); // *** can we move to decode stage (KP?)
// WFI terminates if any enabled interrupt is pending, even if global interrupts are disabled. It could also terminate with TW trap
assign StallMCause = ((wfiM) & (~TrapM & ~IntPendingM));
assign StallWCause = ((IFUStallF | LSUStallM) & ~TrapM) | (FDivBusyE & ~TrapM & ~IntPendingM);
assign StallWCause = ((IFUStallF | LSUStallM) & ~TrapM); // | (FDivBusyE & ~TrapM & ~IntPendingM);
// head version
// assign StallWCause = LSUStallM | IFUStallF | (FDivBusyE & ~TrapM & ~IntPendingM); // *** FDivBusyE should look like DivBusyE
// assign StallMCause = (wfiM & (~TrapM & ~IntPendingM)); // | FDivBusyE;

View File

@ -389,6 +389,7 @@ module wallypipelinedcore (
.ReadDataW(ReadDataW[`FLEN-1:0]),// Read data from memory
.ForwardedSrcAE, // Integer input being processed (from IEU)
.StallE, .StallM, .StallW, // stall signals from HZU
.TrapM,
.FlushE, .FlushM, .FlushW, // flush signals from HZU
.RdM, .RdW, // which FP register to write to (from IEU)
.STATUS_FS, // is floating-point enabled?

View File

@ -136,7 +136,6 @@ string tvpaths[] = '{
string imperas32f[] = '{
`IMPERASTEST,
"rv32i_m/F/FDIV-S-DYN-RDN-01",
"rv32i_m/F/FADD-S-DYN-RDN-01",
"rv32i_m/F/FADD-S-DYN-RMM-01",
"rv32i_m/F/FADD-S-DYN-RNE-01",