diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv index 19679aa55..9af93fb37 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv @@ -55,7 +55,6 @@ module fdivsqrt( // output logic [`XLEN-1:0] RemM, ); - logic [`DIVb+3:0] NextWSN, NextWCN; logic [`DIVb+3:0] WS, WC; logic [`DIVb+3:0] X; logic [`DIVN-2:0] D; // U0.N-1 @@ -77,7 +76,7 @@ module fdivsqrt( .XInfE, .YInfE, .WZero, .SpecialCaseM); fdivsqrtiter fdivsqrtiter( .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, .SqrtM, - .X,.Dpreproc, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, + .X,.Dpreproc, .FirstWS(WS), .FirstWC(WC), .DivStartE, .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE, .DivBusy); fdivsqrtpostproc fdivsqrtpostproc(.WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .Firstun, .SqrtM, .SpecialCaseM, .QmM, .WZero, .DivSM); diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv index d13d706f4..5c067796c 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv @@ -41,7 +41,6 @@ module fdivsqrtiter( input logic [`DIVb+3:0] X, input logic [`DIVN-2:0] Dpreproc, output logic [`DIVN-2:0] D, // U0.N-1 - output logic [`DIVb+3:0] NextWSN, NextWCN, output logic [`DIVb:0] FirstU, FirstUM, output logic [`DIVb+1:0] FirstC, output logic Firstun, @@ -56,12 +55,12 @@ module fdivsqrtiter( // U/UM should be 1.b so b+1 bits or b:0 // C needs to be the lenght of the final fraction 0.b so b or b-1:0 /* verilator lint_off UNOPTFLAT */ - logic [`DIVb+3:0] WSA[`DIVCOPIES-1:0]; // Q4.b - logic [`DIVb+3:0] WCA[`DIVCOPIES-1:0]; // Q4.b - logic [`DIVb+3:0] WS[`DIVCOPIES-1:0]; // Q4.b - logic [`DIVb+3:0] WC[`DIVCOPIES-1:0]; // Q4.b - logic [`DIVb:0] U[`DIVCOPIES-1:0]; // U1.b - logic [`DIVb:0] UM[`DIVCOPIES-1:0];// 1.b + logic [`DIVb+3:0] WSNext[`DIVCOPIES-1:0]; // Q4.b + logic [`DIVb+3:0] WCNext[`DIVCOPIES-1:0]; // Q4.b + logic [`DIVb+3:0] WS[`DIVCOPIES:0]; // Q4.b + logic [`DIVb+3:0] WC[`DIVCOPIES:0]; // Q4.b + logic [`DIVb:0] U[`DIVCOPIES:0]; // U1.b + logic [`DIVb:0] UM[`DIVCOPIES:0];// 1.b logic [`DIVb:0] UNext[`DIVCOPIES-1:0];// U1.b logic [`DIVb:0] UMNext[`DIVCOPIES-1:0];// U1.b logic [`DIVb+1:0] C[`DIVCOPIES:0]; // Q2.b @@ -79,31 +78,35 @@ module fdivsqrtiter( // Top Muxes and Registers // When start is asserted, the inputs are loaded into the divider. - // Otherwise, the divisor is retained and the partial remainder - // is fed back for the next iteration. - // - when the start signal is asserted X and 0 are loaded into WS and WC - // - otherwise load WSA into the flipflop - // - the assumed one is added to D since it's always normalized (and X/0 is a special case handeled by result selection) - // - XZeroE is used as the assumed one to avoid creating a sticky bit - all other numbers are normalized - assign NextWSN = WSA[`DIVCOPIES-1] << `LOGR; - assign NextWCN = WCA[`DIVCOPIES-1] << `LOGR; - - // Initialize C to -1 for sqrt and -R for division - logic [1:0] initCSqrt, initCDiv2, initCDiv4, initCUpper; - assign initCSqrt = 2'b11; // -1 - assign initCDiv2 = 2'b10; // -2 - assign initCDiv4 = 2'b00; // -4 - assign initCUpper = SqrtE ? initCSqrt : (`RADIX == 4) ? initCDiv4 : initCDiv2; - assign initC = {initCUpper, {`DIVb{1'b0}}}; - - mux2 #(`DIVb+4) wsmux(NextWSN, X, DivStartE, WSN); + // Otherwise, the divisor is retained and the residual and result + // are fed back for the next iteration. + + // Residual WS/SC registers/initializaiton mux + mux2 #(`DIVb+4) wsmux(WS[`DIVCOPIES], X, DivStartE, WSN); + mux2 #(`DIVb+4) wcmux(WC[`DIVCOPIES], '0, DivStartE, WCN); flopen #(`DIVb+4) wsflop(clk, DivStartE|DivBusy, WSN, WS[0]); - mux2 #(`DIVb+4) wcmux(NextWCN, '0, DivStartE, WCN); flopen #(`DIVb+4) wcflop(clk, DivStartE|DivBusy, WCN, WC[0]); - flopen #(`DIVN-1) dflop(clk, DivStartE, Dpreproc, D); + + // UOTFC Result U and UM registers/initialization mux + // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 for division + assign initU = SqrtE ? {1'b1, {(`DIVb){1'b0}}} : 0; + assign initUM = SqrtE ? 0 : {1'b1, {(`DIVb){1'b0}}}; + mux2 #(`DIVb+1) Umux(UNext[`DIVCOPIES-1], initU, DivStartE, UMux); + mux2 #(`DIVb+1) UMmux(UMNext[`DIVCOPIES-1], initUM, DivStartE, UMMux); + flopen #(`DIVb+1) UReg(clk, DivStartE|DivBusy, UMux, U[0]); + flopen #(`DIVb+1) UMReg(clk, DivStartE|DivBusy, UMMux, UM[0]); + + // C register/initialization mux + // Initialize C to -1 for sqrt and -R for division + logic [1:0] initCUpper; + assign initCUpper = SqrtE ? 2'b11 : (`RADIX == 4) ? 2'b00 : 2'b10; + assign initC = {initCUpper, {`DIVb{1'b0}}}; mux2 #(`DIVb+2) Cmux(C[`DIVCOPIES], initC, DivStartE, CMux); flopen #(`DIVb+2) cflop(clk, DivStartE|DivBusy, CMux, C[0]); + // Divisior register + flopen #(`DIVN-1) dflop(clk, DivStartE, Dpreproc, D); + // Divisor Selections // - choose the negitive version of what's being selected // - D is only the fraction @@ -113,37 +116,29 @@ module fdivsqrtiter( assign D2 = {2'b0, 1'b1, D, {`DIVb+2-`DIVN{1'b0}}}; end + // k=DIVCOPIES of the recurrence logic genvar i; generate for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : interations if (`RADIX == 2) begin: stage fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtM, - .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), + .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i])); end else begin: stage logic j1; assign j1 = (i == 0 & ~C[0][`DIVb-1]); fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtM, .j1, - .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), + .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i])); end - if(i<(`DIVCOPIES-1)) begin - assign WS[i+1] = WSA[i] << `LOGR; - assign WC[i+1] = WCA[i] << `LOGR; - assign U[i+1] = UNext[i]; - assign UM[i+1] = UMNext[i]; - end + assign WS[i+1] = WSNext[i]; + assign WC[i+1] = WCNext[i]; + assign U[i+1] = UNext[i]; + assign UM[i+1] = UMNext[i]; end endgenerate - // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 for division - assign initU = SqrtE ? {1'b1, {(`DIVb){1'b0}}} : 0; - assign initUM = SqrtE ? 0 : {1'b1, {(`DIVb){1'b0}}}; - mux2 #(`DIVb+1) Umux(UNext[`DIVCOPIES-1], initU, DivStartE, UMux); - mux2 #(`DIVb+1) UMmux(UMNext[`DIVCOPIES-1], initUM, DivStartE, UMMux); - flopen #(`DIVb+1) UReg(clk, DivStartE|DivBusy, UMux, U[0]); - flopen #(`DIVb+1) UMReg(clk, DivStartE|DivBusy, UMMux, UM[0]); - + // Send values from start of cycle for postprocessing assign FirstWS = WS[0]; assign FirstWC = WC[0]; assign FirstU = U[0]; diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv index 4379724ff..73f4e4425 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv @@ -31,19 +31,18 @@ `include "wally-config.vh" module fdivsqrtqsel4 ( - input logic [`DIVN-2:0] D, + input logic [2:0] Dmsbs, input logic [4:0] Smsbs, - input logic [`DIVb+3:0] WS, WC, + input logic [7:0] WSmsbs, WCmsbs, input logic Sqrt, j1, output logic [3:0] udigit ); logic [6:0] Wmsbs; logic [7:0] PreWmsbs; - logic [2:0] Dmsbs, A; + logic [2:0] A; - assign PreWmsbs = WC[`DIVb+3:`DIVb-4] + WS[`DIVb+3:`DIVb-4]; + assign PreWmsbs = WCmsbs + WSmsbs; assign Wmsbs = PreWmsbs[7:1]; - assign Dmsbs = D[`DIVN-2:`DIVN-4];//|{3{D[`DIVN-2]&Sqrt}}; // D = 0001.xxx... // Dmsbs = | | // W = xxxx.xxx... @@ -51,6 +50,7 @@ module fdivsqrtqsel4 ( logic [3:0] USel4[1023:0]; + // Prepopulate selection table; this is constant at compile time always_comb begin integer a, w, i, w2; for(a=0; a<8; a++) @@ -101,12 +101,15 @@ module fdivsqrtqsel4 ( endcase end end + + // Select A always_comb if (Sqrt) begin if (j1) A = 3'b101; else if (Smsbs == 5'b10000) A = 3'b111; else A = Smsbs[2:0]; end else A = Dmsbs; + + // Select quotient digit from lookup table based on A and W assign udigit = USel4[{A,Wmsbs}]; - endmodule diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv new file mode 100644 index 000000000..de4c22a18 --- /dev/null +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv @@ -0,0 +1,93 @@ +/////////////////////////////////////////// +// fdivsqrtqsel4cmp.sv +// +// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu +// Modified:13 January 2022 +// +// Purpose: Comparator-based Radix 4 Quotient Digit Selection +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// MIT LICENSE +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons +// to whom the Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +// PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +// OR OTHER DEALINGS IN THE SOFTWARE. +//////////////////////////////////////////////////////////////////////////////////////////////// + +`include "wally-config.vh" + +module fdivsqrtqsel4cmp ( + input logic [2:0] Dmsbs, + input logic [4:0] Smsbs, + input logic [7:0] WSmsbs, WCmsbs, + input logic Sqrt, j1, + output logic [3:0] udigit +); + logic [6:0] Wmsbs; + logic [7:0] PreWmsbs; + logic [2:0] A; + + assign PreWmsbs = WCmsbs + WSmsbs; + assign Wmsbs = PreWmsbs[7:1]; + // D = 0001.xxx... + // Dmsbs = | | + // W = xxxx.xxx... + // Wmsbs = | | + + logic [6:0] mk2, mk1, mk0, mkm1; + logic [6:0] mks2[7:0], mks1[7:0]; + + // Prepopulate table of mks0 + assign mks2[0] = 12; + assign mks2[1] = 14; + assign mks2[2] = 16; + assign mks2[3] = 17; + assign mks2[4] = 18; + assign mks2[5] = 20; + assign mks2[6] = 22; + assign mks2[7] = 23; + assign mks1[0] = 4; + assign mks1[1] = 4; + assign mks1[2] = 6; + assign mks1[3] = 6; + assign mks1[4] = 6; + assign mks1[5] = 8; // is the logic any cheaper if this is a 6? + assign mks1[6] = 8; + assign mks1[7] = 8; + + // Choose A for current operation + always_comb + if (Sqrt) begin + if (j1) A = 3'b101; + else if (Smsbs == 5'b10000) A = 3'b111; + else A = Smsbs[2:0]; + end else A = Dmsbs; + + // Choose selection constants based on a + assign mk2 = mks2[A]; + assign mk1 = mks1[A]; + assign mk0 = -mks1[A]; + assign mkm1 = (A == 3'b000) ? -13 : -mks2[A]; // asymmetry in table + + // Compare residual W to selection constants to choose digit + always_comb + if ($signed(Wmsbs) >= $signed(mk2)) udigit = 4'b1000; // choose 2 + else if ($signed(Wmsbs) >= $signed(mk1)) udigit = 4'b0100; // choose 1 + else if ($signed(Wmsbs) >= $signed(mk0)) udigit = 4'b0000; // choose 0 + else if ($signed(Wmsbs) >= $signed(mkm1)) udigit = 4'b0010; // choose -1 + else udigit = 4'b0001; // choose -2 +endmodule diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv index 987f23576..8ed1664af 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv @@ -41,7 +41,7 @@ module fdivsqrtstage2 ( output logic un, output logic [`DIVb+1:0] CNext, output logic [`DIVb:0] UNext, UMNext, - output logic [`DIVb+3:0] WSA, WCA + output logic [`DIVb+3:0] WSNext, WCNext ); /* verilator lint_on UNOPTFLAT */ @@ -49,8 +49,7 @@ module fdivsqrtstage2 ( logic up, uz; logic [`DIVb+3:0] F; logic [`DIVb+3:0] AddIn; - - assign CNext = {1'b1, C[`DIVb+1:1]}; + logic [`DIVb+3:0] WSA, WCA; // Qmient Selection logic // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un) @@ -61,8 +60,11 @@ module fdivsqrtstage2 ( // 0010 = -1 // 0001 = -2 fdivsqrtqsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], up, uz, un); + + // Sqrt F generatin fdivsqrtfgen2 fgen2(.up, .uz, .C(CNext), .U, .UM, .F); + // Divisor multiple always_comb if (up) Dsel = DBar; else if (uz) Dsel = '0; // qz @@ -72,7 +74,13 @@ module fdivsqrtstage2 ( // WSA, WCA = WS + WC - qD assign AddIn = SqrtM ? F : Dsel; csa #(`DIVb+4) csa(WS, WC, AddIn, up&~SqrtM, WSA, WCA); + assign WSNext = WSA << 1; + assign WCNext = WCA << 1; + // Shift thermometer code C + assign CNext = {1'b1, C[`DIVb+1:1]}; + + // Unified On-The-Fly Converter to accumulate result fdivsqrtuotfc2 uotfc2(.up, .uz, .C(CNext), .U, .UM, .UNext, .UMNext); endmodule diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv index e463762a2..05792293c 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv @@ -30,7 +30,6 @@ `include "wally-config.vh" -/* verilator lint_off UNOPTFLAT */ module fdivsqrtstage4 ( input logic [`DIVN-2:0] D, input logic [`DIVb+3:0] DBar, D2, DBar2, @@ -41,17 +40,18 @@ module fdivsqrtstage4 ( input logic SqrtM, j1, output logic un, output logic [`DIVb:0] UNext, UMNext, - output logic [`DIVb+3:0] WSA, WCA + output logic [`DIVb+3:0] WSNext, WCNext ); - /* verilator lint_on UNOPTFLAT */ logic [`DIVb+3:0] Dsel; logic [3:0] udigit; logic [`DIVb+3:0] F; logic [`DIVb+3:0] AddIn; logic [4:0] Smsbs; + logic [2:0] Dmsbs; + logic [7:0] WCmsbs, WSmsbs; logic CarryIn; - assign CNext = {2'b11, C[`DIVb+1:2]}; + logic [`DIVb+3:0] WSA, WCA; // Digit Selection logic // u encoding: @@ -61,28 +61,40 @@ module fdivsqrtstage4 ( // 0010 = -1 // 0001 = -2 assign Smsbs = U[`DIVb:`DIVb-4]; - fdivsqrtqsel4 qsel4(.D, .Smsbs, .WS, .WC, .Sqrt(SqrtM), .j1, .udigit); + assign Dmsbs = D[`DIVN-2:`DIVN-4]; + assign WCmsbs = WC[`DIVb+3:`DIVb-4]; + assign WSmsbs = WS[`DIVb+3:`DIVb-4]; + + fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .Sqrt(SqrtM), .j1, .udigit); + assign un = 0; // unused for radix 4 + + // F generation logic fdivsqrtfgen4 fgen4(.udigit, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F); + // Divisor multiple logic always_comb - case (udigit) - 4'b1000: Dsel = DBar2; - 4'b0100: Dsel = DBar; - 4'b0000: Dsel = '0; - 4'b0010: Dsel = {3'b0, 1'b1, D, {`DIVb-`DIVN+1{1'b0}}}; - 4'b0001: Dsel = D2; - default: Dsel = 'x; - endcase + case (udigit) + 4'b1000: Dsel = DBar2; + 4'b0100: Dsel = DBar; + 4'b0000: Dsel = '0; + 4'b0010: Dsel = {3'b0, 1'b1, D, {`DIVb-`DIVN+1{1'b0}}}; + 4'b0001: Dsel = D2; + default: Dsel = 'x; + endcase - // Partial Product Generation - // WSA, WCA = WS + WC - qD + // Residual Update + // {WS, WC}}Next = (WS + WC - qD or F) << 2 assign AddIn = SqrtM ? F : Dsel; assign CarryIn = ~SqrtM & (udigit[3] | udigit[2]); // +1 for 2's complement of -D and -2D csa #(`DIVb+4) csa(WS, WC, AddIn, CarryIn, WSA, WCA); - - fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext); + assign WSNext = WSA << 2; + assign WCNext = WCA << 2; - assign un = 0; // unused for radix 4 + // Shift thermometer code C + assign CNext = {2'b11, C[`DIVb+1:2]}; + + // On-the-fly converter to accumulate result + fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext); endmodule