/////////////////////////////////////////// // mul.sv // // Written: James.Stine@okstate.edu 1 February 2021 // Modified: // // Purpose: Integer Divide instructions // // A component of the Wally configurable RISC-V project. // // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University // // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software // is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /////////////////////////////////////////// // *** I added these verilator controls to clean up the // lint output. The linter warnings should be fixed, but now the output is at // least readable. 
/* verilator lint_off COMBDLY */
/* verilator lint_off IMPLICIT */

`include "wally-config.vh"

// div: top level of the 64-bit integer divider.
//
//   N, D     : dividend and divisor
//   S        : 1 = treat N and D as signed (two's complement)
//   start    : request passed to the control FSM (fsm64, defined elsewhere)
//   Qf, remf : final quotient and remainder after exception handling
//   div0     : asserted when the leading-zero detector reports no valid
//              (non-zero) divisor
//   done, divBusy : registered status outputs of the control FSM
//
// Review note: every internal signal is now declared explicitly.  The
// original relied on implicit 1-bit nets for otfzerov (it declared a
// misspelled "oftzerov" instead), tcQ and tcR; the connectivity is unchanged.
module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);

   input logic [63:0]  N, D;
   input logic         clk;
   input logic         reset;
   input logic         start;
   input logic         S;

   output logic [63:0] Qf;
   output logic [63:0] remf;
   output logic        div0;
   output logic        done;
   output logic        divBusy;

   logic               divdone;
   logic               enable;
   logic               state0;
   logic               V;
   logic [7:0]         Num;
   logic [5:0]         P, NumIter, RemShift;
   logic [63:0]        op1, op2, op1shift, Rem5;
   logic [64:0]        Qd, Rd, Qd2, Rd2;
   logic [63:0]        Q, rem0;
   logic [3:0]         quotient;
   logic               otfzero;
   logic               shiftResult;
   logic               enablev, state0v, donev, divdonev, otfzerov, divBusyv;
   logic [63:0]        twoD;
   logic [63:0]        twoN;
   logic               SignD;
   logic               SignN;
   logic [63:0]        QT, remT;
   logic               tcQ, tcR;
   logic               D_NegOne;
   logic               Max_N;

   // Check if negative (two's complement)
   // If so, convert to positive: invert and add 1 only when the operand's
   // sign bit is set and the operation is signed.
   adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD);
   adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN);
   assign SignD = D[63];
   assign SignN = N[63];
   // Max N and D = -1 (Overflow)
   assign Max_N = (~|N[62:0]) & N[63];
   assign D_NegOne = &D;

   // Divider goes the distance to 37 cycles
   // (thanks to the evil divisor for D = 0x1)

   // Shift D, if needed (for integer)
   // needed to allow qst to be in range for integer
   // division [1,2) and allow integer divide to work.
   //
   // The V or valid bit can be used to determine if D
   // is 0 and thus a divide by 0 exception.  This div0
   // exception is given to FSM to tell the operation to
   // quit gracefully.
   lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD));
   shift_left #(64) p2 (twoD, P, op2);
   assign op1 = twoN;
   assign div0 = ~V;

   // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0)
   // v = 2 since \rho < 1 (add 4 to make sure its a ceil)
   // The added constant is 4 when P is odd and 2 when P is even.
   adder #(8) cpa3 ({2'b0, P},
                    {5'h0, shiftResult, ~shiftResult, 1'b0},
                    Num);

   // Determine whether need to add just Q/Rem
   assign shiftResult = P[0];
   // div by 2 (ceil)
   assign NumIter = Num[6:1];
   assign RemShift = P;

   // FSM to control integer divider
   //   assume inputs are positive edge and
   //   datapath (divider) is negative edge
   fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv,
               start, div0, NumIter, ~clk, reset);

   // Register the FSM outputs on the inverted clock
   flopr #(1) rega (~clk, reset, donev, done);
   flopr #(1) regb (~clk, reset, divdonev, divdone);
   flopr #(1) regc (~clk, reset, otfzerov, otfzero);
   flopr #(1) regd (~clk, reset, enablev, enable);
   flopr #(1) rege (~clk, reset, state0v, state0);
   flopr #(1) regf (~clk, reset, divBusyv, divBusy);

   // To obtain a correct remainder the last bit of the
   // quotient has to be aligned with a radix-r boundary.
   // Since the quotient is in the range 1/2 < q < 2 (one
   // integer bit and m fractional bits), this is achieved by
   // shifting N right by v+s so that (m+v+s) mod k = 0.  And,
   // the quotient has to be aligned to the integer position.
   divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0,
                  enable, otfzero, shiftResult);

   // Storage registers to hold contents stable
   flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2);
   flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2);

   // Probably not needed - just assigns results
   assign Q = Qd2[63:0];
   assign Rem5 = Rd2[64:1];
   // Adjust remainder by m
   shift_right #(64) p4 (Rem5, RemShift, rem0);

   // Adjust Q/Rem for Signed
   assign tcQ = (SignN ^ SignD) & S;
   assign tcR = SignN & S;
   // Signed Divide
   // - When N and D are negative: Remainder is negative (undergoes a two's complement).
   // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement).
   // - When D is negative: Quotient is negative (undergoes a two's complement).
   adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT);
   adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT);

   // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec)
   exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf);

endmodule // int32div

// divide4x64: iterative datapath of the divider.  Each enabled clock the
// carry-save partial remainder (SumN2/CarryN2) is shifted left by two bits,
// a quotient digit is selected by qst4 (defined elsewhere) and the selected
// divisor multiple is folded in with a carry-save adder.  The quotient is
// accumulated by the on-the-fly converter (otf); the final remainder is
// corrected by +D when the resolved remainder is negative.
//
// Review note: ulp is now declared in this module; the original drove and
// used it here through an implicit 1-bit net.
module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0,
                   enable, otfzero, shiftResult);

   input logic [63:0]  op1, op2;
   input logic         clk, state0;
   input logic         reset;
   input logic         enable;
   input logic         otfzero;
   input logic         shiftResult;

   output logic [64:0] rem0;
   output logic [64:0] Q;
   output logic [3:0]  quotient;

   logic [67:0]        Sum, Carry;
   logic [64:0]        Qstar;
   logic [64:0]        QMstar;
   logic [7:0]         qtotal;
   logic [67:0]        SumN, CarryN, SumN2, CarryN2;
   logic [67:0]        divi1, divi2, divi1c, divi2c, dive1;
   logic [67:0]        mdivi_temp, mdivi;
   logic               zero;
   logic               ulp;
   logic [1:0]         qsel;
   logic [1:0]         Qin, QMin;
   logic               CshiftQ, CshiftQM;
   logic [67:0]        rem1, rem2, rem3;
   logic [67:0]        SumR, CarryR;
   logic [64:0]        Qt;

   // Create one's complement values of Divisor (for q*D)
   assign divi1 = {3'h0, op2, 1'b0};
   assign divi2 = {2'h0, op2, 2'b0};
   assign divi1c = ~divi1;
   assign divi2c = ~divi2;
   // Shift x1 if not mod k
   mux2 #(68) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1);

   // I I I . F F F F F ... (Robertson Criteria - \rho * qmax * D)
   mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN);
   mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN);
   // Simplify QST
   adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal);
   // q = {+2, +1, -1, -2} else q = 0
   qst4 pd1 (qtotal[7:1], divi1[63:61], quotient);
   // ulp supplies the +1 that completes the two's complement of the
   // one's-complemented divisor multiples; it is injected as the LSB of
   // the carry word in csa1 below.
   assign ulp = quotient[2]|quotient[3];
   assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]);
   // Map to binary encoding
   assign qsel[1] = quotient[3]|quotient[2];
   assign qsel[0] = quotient[3]|quotient[1];
   mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp);
   mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi);
   csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry);
   // regs : save CSA
   flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2);
   flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2);
   // OTF (otfzero drives the converter's reset port)
   ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM);
   otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk,
                   otfzero, enable, Qstar, QMstar);

   // Correction and generation of Remainder
   adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1);
   // Add back +D as correction
   csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR);
   adder #(68) cpa3 (SumR, CarryR, rem2);
   // Choose remainder (Rem or Rem+D)
   mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3);
   // Choose correct Q or QM
   mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt);

   // Final results
   assign rem0 = rem3[64:0];
   assign Q = Qt;

endmodule // divide4x64

// ls_control: decodes the 4-bit quotient-digit vector into the 2-bit values
// appended to the Q and QM registers of the on-the-fly converter (Qin, QMin)
// and the select signals that choose which register each one is built from
// (CshiftQ, CshiftQM).
module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM);

   input logic [3:0]  quot;

   output logic [1:0] Qin;
   output logic [1:0] QMin;
   output logic       CshiftQ;
   output logic       CshiftQM;

   // Load/Store Control for OTF
   assign Qin[1] = (quot[1]) | (quot[3]) | (quot[0]);
   assign Qin[0] = (quot[1]) | (quot[2]);

   assign QMin[1] = (quot[1]) | (!quot[3]&!quot[2]&!quot[1]&!quot[0]);
   assign QMin[0] = (quot[3]) | (quot[0]) |
                    (!quot[3]&!quot[2]&!quot[1]&!quot[0]);

   assign CshiftQ = (quot[1]) | (quot[0]);
   assign CshiftQM = (quot[3]) | (quot[2]);

endmodule

// On-the-fly
// Conversion per Ercegovac/Lang
//
// otf: keeps two running quotient registers, R2Q (Q) and R1Q (QM).  On every
// enabled clock each register is loaded with one of the two registers
// (chosen by CshiftQ / CshiftQM), shifted left by two bits, with the new
// 2-bit value (Qin / QMin) appended.
// NOTE(review): the select polarity depends on mux2 (defined elsewhere),
// presumably (d0, d1, sel, y) - confirm.
module otf #(parameter WIDTH=8)
   (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q);

   input logic [1:0]        Qin, QMin;
   input logic              CshiftQ, CshiftQM;
   input logic              clk;
   input logic              reset;
   input logic              enable;

   output logic [WIDTH-1:0] R2Q;
   output logic [WIDTH-1:0] R1Q;

   logic [WIDTH-1:0]        Qstar, QMstar;
   logic [WIDTH-1:0]        M1Q, M2Q;

   // QM
   mux2 #(WIDTH) m1 (QMstar, Qstar, CshiftQM, M1Q);
   flopenr #(WIDTH) r1 (clk, reset, enable, {M1Q[WIDTH-3:0], QMin}, R1Q);
   // Q
   mux2 #(WIDTH) m2 (Qstar, QMstar, CshiftQ, M2Q);
   flopenr #(WIDTH) r2 (clk, reset, enable, {M2Q[WIDTH-3:0], Qin}, R2Q);

   assign Qstar = R2Q;
   assign QMstar = R1Q;

endmodule // otf8

// adder: WIDTH-bit carry-propagate adder; the carry out of the top bit is
// discarded.
module adder #(parameter WIDTH=8)
   (input logic [WIDTH-1:0] a, b,
    output logic [WIDTH-1:0] y);

   assign y = a + b;

endmodule // adder

// fa: one-bit full adder.
module fa (input logic a, b, c, output logic sum, carry);

   assign sum = a^b^c;
   assign carry = a&b|a&c|b&c;

endmodule // fa

// csa: WIDTH-bit carry-save adder reducing three operands to a sum word and
// a carry word.
//
// NOTE(review): in the text under review the generate loop was cut off right
// after "for (i=0;i", so the loop bound, the loop body and the carry
// assignment below are a reconstruction: one fa per bit, with the carry word
// shifted left by one position and a zero LSB (consistent with divide4x64
// substituting its own LSB into the carry operand).  Confirm against the
// upstream file before merging.
module csa #(parameter WIDTH=8)
   (input logic [WIDTH-1:0] a, b, c,
    output logic [WIDTH-1:0] sum, carry);

   logic [WIDTH:0]          carry_temp;
   genvar                   i;

   generate
      for (i=0;i<WIDTH;i=i+1)
        begin : genbit
           fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
        end
   endgenerate

   // carry_temp[WIDTH] (carry out of the top bit) is dropped
   assign carry = {carry_temp[WIDTH-1:1], 1'b0};

endmodule // csa

// 2-bit magnitude comparator.  LT is '1' if A < B and GT is '1' if A > B.
// LT and GT are both '0' if A = B.
module magcompare2b (LT, GT, A, B);

   input logic [1:0] A;
   input logic [1:0] B;

   output logic      LT;
   output logic      GT;

   // Determine if A < B using a minimized sum-of-products expression
   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
   // Determine if A > B using a minimized sum-of-products expression
   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];

endmodule // magcompare2b

// J. E. Stine and M. J. Schulte, "A combined two's complement and
// floating-point comparator," 2005 IEEE International Symposium on
// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1.
// doi: 10.1109/ISCAS.2005.1464531
//
// magcompare8: 8-bit magnitude comparator built as a tree of 2-bit
// comparators.  The first level compares A and B two bits at a time
// (s = "less than" flags, t = "greater than" flags); the next two levels
// compare the flag vectors against each other to resolve the overall
// LT / GT.  EQ is asserted when neither LT nor GT is set.
module magcompare8 (LT, EQ, A, B);

   input logic [7:0] A;
   input logic [7:0] B;

   logic [3:0]       s;
   logic [3:0]       t;
   logic [1:0]       u;
   logic [1:0]       v;
   logic             GT;
   //wire LT;

   output logic      EQ;
   output logic      LT;

   magcompare2b mag1 (s[0], t[0], A[1:0], B[1:0]);
   magcompare2b mag2 (s[1], t[1], A[3:2], B[3:2]);
   magcompare2b mag3 (s[2], t[2], A[5:4], B[5:4]);
   magcompare2b mag4 (s[3], t[3], A[7:6], B[7:6]);

   magcompare2b mag5 (u[0], v[0], t[1:0], s[1:0]);
   magcompare2b mag6 (u[1], v[1], t[3:2], s[3:2]);

   magcompare2b mag7 (LT, GT, v[1:0], u[1:0]);

   assign EQ = ~(GT | LT);

endmodule // magcompare8

// exception_int: selects the architecturally defined results for the
// special cases of integer division.
//
//   div0 = 1                          : Qf = all ones, remf = op1 (dividend)
//   S & Max_N & D_NegOne (and ~div0)  : signed overflow,
//                                       Qf = 64'h8000_0000_0000_0000, remf = 0
//   otherwise                         : Qf = Q, remf = rem
//
// Review note: the overflow quotient was previously written as
// {1'b1, 31'h0}, a 32-bit value that zero-extends to
// 64'h0000_0000_8000_0000; it is now the 64-bit most-negative value.
// The case labels are kept exhaustive (rather than using wildcards) so
// that X/Z on the selector still falls through to the default arm.
module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);

   input logic [63:0]  Q;
   input logic [63:0]  rem;
   input logic [63:0]  op1;
   input logic         S;
   input logic         div0;
   input logic         Max_N;
   input logic         D_NegOne;

   output logic [63:0] Qf;
   output logic [63:0] remf;

   // Needs to be optimized
   always_comb
     case ({div0, S, Max_N, D_NegOne})
       4'b0000, 4'b0001, 4'b0010, 4'b0011,
       4'b0100, 4'b0101, 4'b0110 : Qf = Q;
       4'b0111                   : Qf = {1'b1, 63'h0};
       4'b1000, 4'b1001, 4'b1010, 4'b1011,
       4'b1100, 4'b1101, 4'b1110, 4'b1111 : Qf = {64{1'b1}};
       default                   : Qf = Q;
     endcase

   always_comb
     case ({div0, S, Max_N, D_NegOne})
       4'b0000, 4'b0001, 4'b0010, 4'b0011,
       4'b0100, 4'b0101, 4'b0110 : remf = rem;
       4'b0111                   : remf = 64'h0;
       4'b1000, 4'b1001, 4'b1010, 4'b1011,
       4'b1100, 4'b1101, 4'b1110, 4'b1111 : remf = op1;
       default                   : remf = rem;
     endcase

endmodule // exception_int

/* verilator lint_on COMBDLY */
/* verilator lint_on IMPLICIT */