diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv index d234144c..4df7a147 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv @@ -120,15 +120,15 @@ module fdivsqrtiter( // k=DIVCOPIES of the recurrence logic genvar i; generate - for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : interations + for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : iterations if (`RADIX == 2) begin: stage - fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtM, - .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), + fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtM, .OTFCSwap, + .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i])); end else begin: stage logic j1; assign j1 = (i == 0 & ~C[0][`DIVb-1]); - fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtM, .j1, + fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtM, .j1, .OTFCSwap, .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i])); end @@ -142,9 +142,9 @@ module fdivsqrtiter( // Send values from start of cycle for postprocessing assign FirstWS = WS[0]; assign FirstWC = WC[0]; - assign FirstU = U[0]; + assign FirstU = U[0]; assign FirstUM = UM[0]; - assign FirstC = C[0]; + assign FirstC = C[0]; assign Firstun = un[0]; endmodule diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel2.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel2.sv index 8a3fc659..ae927c97 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel2.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel2.sv @@ -32,11 +32,13 @@ module fdivsqrtqsel2 ( input logic [3:0] ps, pc, - output logic up, uz, un + input logic swap, + output logic up, uz, un ); logic [3:0] p, g; - logic magnitude, sign, cout; + logic magnitude, sign, cout; + logic pos, neg; // The quotient selection logic is presented for simplicity, not // for efficiency. You can probably optimize your logic to @@ -57,7 +59,11 @@ module fdivsqrtqsel2 ( (ps[0]&pc[0]))))); // Produce digit = +1, 0, or -1 - assign up = magnitude & ~sign; - assign uz = ~magnitude; - assign un = magnitude & sign; + assign pos = magnitude & ~sign; + assign uz = ~magnitude; + assign neg = magnitude & sign; + + // Check for swap (int div only) + assign un = swap ? pos : neg; + assign up = swap ? neg : pos; endmodule diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv index de4c22a1..b7bcf949 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv @@ -34,12 +34,13 @@ module fdivsqrtqsel4cmp ( input logic [2:0] Dmsbs, input logic [4:0] Smsbs, input logic [7:0] WSmsbs, WCmsbs, - input logic Sqrt, j1, + input logic Sqrt, j1, OTFCSwap, output logic [3:0] udigit ); logic [6:0] Wmsbs; logic [7:0] PreWmsbs; logic [2:0] A; + logic [3:0] udigitsel, udigitswap; assign PreWmsbs = WCmsbs + WSmsbs; assign Wmsbs = PreWmsbs[7:1]; @@ -85,9 +86,12 @@ module fdivsqrtqsel4cmp ( // Compare residual W to selection constants to choose digit always_comb - if ($signed(Wmsbs) >= $signed(mk2)) udigit = 4'b1000; // choose 2 - else if ($signed(Wmsbs) >= $signed(mk1)) udigit = 4'b0100; // choose 1 - else if ($signed(Wmsbs) >= $signed(mk0)) udigit = 4'b0000; // choose 0 - else if ($signed(Wmsbs) >= $signed(mkm1)) udigit = 4'b0010; // choose -1 - else udigit = 4'b0001; // choose -2 + if ($signed(Wmsbs) >= $signed(mk2)) udigitsel = 4'b1000; // choose 2 + else if ($signed(Wmsbs) >= $signed(mk1)) udigitsel = 4'b0100; // choose 1 + else if ($signed(Wmsbs) >= $signed(mk0)) udigitsel = 4'b0000; // choose 0 + else if ($signed(Wmsbs) >= $signed(mkm1)) udigitsel = 4'b0010; // choose -1 + else udigitsel = 4'b0001; // choose -2 + + assign udigitswap = {udigitsel[0], udigitsel[1], udigitsel[2], udigitsel[3]}; + assign udigit = OTFCSwap ? udigitswap : udigitsel; endmodule diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv index 09f82da8..8d78ccd5 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv @@ -32,12 +32,13 @@ /* verilator lint_off UNOPTFLAT */ module fdivsqrtstage2 ( - input logic [`DIVN-2:0] D, - input logic [`DIVb+3:0] DBar, - input logic [`DIVb:0] U, UM, - input logic [`DIVb+3:0] WS, WC, - input logic [`DIVb+1:0] C, - input logic SqrtM, + input logic [`DIVN-2:0] D, + input logic [`DIVb+3:0] DBar, + input logic [`DIVb:0] U, UM, + input logic [`DIVb+3:0] WS, WC, + input logic [`DIVb+1:0] C, + input logic SqrtM, + input logic OTFCSwap, output logic un, output logic [`DIVb+1:0] CNext, output logic [`DIVb:0] UNext, UMNext, @@ -59,7 +60,7 @@ module fdivsqrtstage2 ( // 0000 = 0 // 0010 = -1 // 0001 = -2 - fdivsqrtqsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], up, uz, un); + fdivsqrtqsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], OTFCSwap, up, uz, un); // Sqrt F generation fdivsqrtfgen2 fgen2(.up, .uz, .C(CNext), .U, .UM, .F); diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv index 05792293..92e8f55d 100644 --- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv +++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv @@ -31,13 +31,13 @@ `include "wally-config.vh" module fdivsqrtstage4 ( - input logic [`DIVN-2:0] D, - input logic [`DIVb+3:0] DBar, D2, DBar2, - input logic [`DIVb:0] U, UM, - input logic [`DIVb+3:0] WS, WC, - input logic [`DIVb+1:0] C, + input logic [`DIVN-2:0] D, + input logic [`DIVb+3:0] DBar, D2, DBar2, + input logic [`DIVb:0] U, UM, + input logic [`DIVb+3:0] WS, WC, + input logic [`DIVb+1:0] C, + input logic SqrtM, j1, OTFCSwap, output logic [`DIVb+1:0] CNext, - input logic SqrtM, j1, output logic un, output logic [`DIVb:0] UNext, UMNext, output logic [`DIVb+3:0] WSNext, WCNext @@ -65,8 +65,8 @@ module fdivsqrtstage4 ( assign WCmsbs = WC[`DIVb+3:`DIVb-4]; assign WSmsbs = WS[`DIVb+3:`DIVb-4]; - fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .Sqrt(SqrtM), .j1, .udigit); - assign un = 0; // unused for radix 4 + fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .Sqrt(SqrtM), .j1, .udigit, .OTFCSwap); + assign un = 1'b0; // unused for radix 4 // F generation logic fdivsqrtfgen4 fgen4(.udigit, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F); diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h index ca197876..ac3d81c3 100644 --- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h +++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h @@ -1081,8 +1081,9 @@ uart_read_LSR_IIR: bne a4, t6, uart_read_LSR_IIR j uart_data_ready uart_rxfifo_timout: - li t4, 0x10000000 // read from the fifo + li t4, 0x10000000 // read from the fifo to clear the rx timeout error lb t5, 0(t4) + sb t5, 0(t4) // write back to the fifo to make sure we have the same data so expected future overrun errors still occur. //read the fifo until empty j uart_read_LSR_IIR @@ -1090,6 +1091,7 @@ uart_rxfifo_timout: uart_data_ready: li t2, 0 sw t2, 0(t1) // clear entry deadbeef from memory + lbu t4, 0(t3) // re read IIR andi t5, t5, 0x9F // mask THRE and TEMT from signature sb t4, 1(t1) // IIR sb t5, 0(t1) // LSR