diff --git a/wally-pipelined/src/fpu/divconv.sv b/wally-pipelined/src/fpu/divconv.sv index 7fbafeb1..88596451 100755 --- a/wally-pipelined/src/fpu/divconv.sv +++ b/wally-pipelined/src/fpu/divconv.sv @@ -1,18 +1,41 @@ -module divconv ( +/////////////////////////////////////////// +// +// Written: James Stine +// Modified: 9/28/2021 +// +// Purpose: Main convergence routine for floating point divider/square root unit (Goldschmidt) +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// - input logic [52:0] d, n, +module divconv ( + input logic [52:0] d, n, input logic [2:0] sel_muxa, sel_muxb, - input logic sel_muxr, - input logic load_rega, load_regb, load_regc, load_regd, - input logic load_regr, load_regs, - input logic P, - input logic op_type, - input logic exp_odd, - input logic reset, - input logic clk, - + input logic sel_muxr, + input logic load_rega, load_regb, load_regc, load_regd, + input logic load_regr, load_regs, + input logic P, + input logic op_type, + input logic exp_odd, + input logic reset, + input logic clk, + output logic [63:0] q1, qp1, qm1, - output logic [63:0] q0, qp0, qm0, + output logic [63:0] q0, qp0, qm0, output logic [63:0] rega_out, regb_out, regc_out, regd_out, output logic [127:0] regr_out ); @@ -26,14 +49,11 @@ module divconv ( logic [63:0] mcand, mplier, mcand_q; logic [63:0] twocmp_out; logic [64:0] three; - logic [127:0] Carry, Carry2; - logic [127:0] Sum, Sum2; logic [127:0] constant, constant2; logic [63:0] q_const, qp_const, qm_const; logic [63:0] d2, n2; - logic [11:0] d3; - logic muxr_out; - logic cout1, cout2, cout3, cout4, cout5, cout6, cout7; + logic muxr_out; + logic cout1, cout2, cout3, cout4, cout5, cout6, cout7; // Check if exponent is odd for sqrt // If exp_odd=1 and sqrt, then M/2 and use ia_addr=0 as IA @@ -99,4 +119,3 @@ module divconv ( flopenr #(64) regk (clk, reset, load_regs, {qp_out0[63:39], (qp_out0[38:10] & {29{~P}}), 10'h0}, qp0); endmodule // divconv - diff --git a/wally-pipelined/src/fpu/fpdiv.sv b/wally-pipelined/src/fpu/fpdiv.sv index f0f81bd2..1f1788f9 100755 --- a/wally-pipelined/src/fpu/fpdiv.sv +++ b/wally-pipelined/src/fpu/fpdiv.sv @@ -60,8 +60,6 @@ module fpdiv ( logic Invalid; logic [4:0] FlagsIn; logic signResult; - logic convert; - logic sub; logic [63:0] q1, qm1, qp1, q0, qm0, qp0; logic [63:0] rega_out, regb_out, regc_out, regd_out; @@ -105,10 +103,10 @@ module fpdiv ( .load_regr, .load_regs, .P, .op_type, .exp_odd); // FSM : control divider - fsm control (.clk, .reset, .start, .op_type, - .done, .load_rega, .load_regb, .load_regc, .load_regd, - .load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, - .divBusy(FDivBusyE)); + fsm_fpdiv control (.clk, .reset, .start, .op_type, + .done, .load_rega, .load_regb, .load_regc, .load_regd, + .load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, + .divBusy(FDivBusyE)); // Round the mantissa to a 52-bit value, with the leading one // removed. The rounding units also handles special cases and diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv index 8258b9c6..fd91b1b2 100755 --- a/wally-pipelined/src/fpu/fpu.sv +++ b/wally-pipelined/src/fpu/fpu.sv @@ -129,16 +129,16 @@ module fpu ( logic [63:0] AlignedSrcAE; // align SrcA to the floating point format // DECODE STAGE + // calculate FP control signals fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW, .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, .FIntResSelD, .FmtD, .FrmD, .FWriteIntD); // FP register file - // - can read 3 registers and write 1 register every cycle fregfile fregfile (.clk, .reset, .we4(FRegWriteW), - .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), - .wd4(FPUResultW), + .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), + .a4(RdW), .wd4(FPUResultW), .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D)); // D/E pipeline registers @@ -158,23 +158,23 @@ module fpu ( .FStallD, .FForwardXE, .FForwardYE, .FForwardZE); // forwarding muxs - mux3 #(64) fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE); - mux3 #(64) fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE); - mux3 #(64) fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE); - mux3 #(64) fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, - {2'b0, {10{1'b1}}, 52'b0}, - {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, - FSrcYE); // Force Z to be 0 for multiply instructions + mux3 #(64) fxemux (FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE); + mux3 #(64) fyemux (FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE); + mux3 #(64) fzemux (FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE); + mux3 #(64) fyaddmux (FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, + {2'b0, {10{1'b1}}, 52'b0}, + {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, + FSrcYE); // Force Z to be 0 for multiply instructions // Force Z to be 0 for multiply instructions - mux3 #(64) fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE); + mux3 #(64) fzmulmux (FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE); // unpacking unit // - splits FP inputs into their various parts // - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity) - unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, - .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, - .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, - .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE); + unpacking unpacking (.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, + .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, + .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, + .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE); // FMA // - two stage FMA @@ -191,7 +191,7 @@ module fpu ( .FmtE, .FmtM, .FrmM, .FMAFlgM, .FMAResM); - // capture the inputs for divide/sqrt + // fpdivsqrt using Goldschmidt's iteration floprc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E), .clear(FDivSqrtDoneE), .reset(reset), .clk(FDivBusyE)); @@ -201,12 +201,11 @@ module fpu ( floprc #(6) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE}), .q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ}), .clear(FDivSqrtDoneE), - .reset(reset), .clk(FDivBusyE)); - // fpdivsqrt using Goldschmidt's iteration + .reset(reset), .clk(FDivBusyE)); fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), - .reset, .clk(clk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1), - .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ, - .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM)); + .reset, .clk(clk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1), + .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ, + .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM)); // convert from signle to double and vice versa cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE); @@ -221,17 +220,14 @@ module fpu ( .Invalid(CmpNVE), .CmpResE); // sign injection unit - // - computation is done in one stage fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE, .SgnNVE, .SgnResE); // classify - // - computation is done in one stage - // - most of the work is done in the unpacking unit - // - result is written to the integer register fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, .XSNaNE, .ClassResE); - + + // Convert fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE, .CvtResE, .CvtFlgE); @@ -254,22 +250,23 @@ module fpu ( // E/M pipe registers // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM); - flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM}); - flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM}); - flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM}); - flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, - {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE}, - {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM}); - flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); - flopenrc #(5) EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM); - flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM); - flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM, - {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE}, - {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM}); + flopenrc #(65) EMFpReg2 (clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM}); + flopenrc #(65) EMFpReg3 (clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM}); + flopenrc #(64) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM}); + flopenrc #(12) EMFpReg5 (clk, reset, FlushM, ~StallM, + {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE}, + {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM}); + flopenrc #(64) EMRegCmpRes (clk, reset, FlushM, ~StallM, FResE, FResM); + flopenrc #(5) EMRegCmpFlg (clk, reset, FlushM, ~StallM, FFlgE, FFlgM); + flopenrc #(`XLEN) EMRegSgnRes (clk, reset, FlushM, ~StallM, FIntResE, FIntResM); + flopenrc #(11) EMCtrlReg (clk, reset, FlushM, ~StallM, + {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE}, + {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM}); // BEGIN MEMORY STAGE + // FPU flag selection - to privileged - mux4 #(5) FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM); + mux4 #(5) FPUFlgMux (5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM); // M/W pipe registers flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); @@ -285,10 +282,10 @@ module fpu ( // put ReadData into NaN-blocking format // - if there are any unsused bits the most significant bits are filled with 1s // - for load instruction - mux2 #(64) ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW); + mux2 #(64) ReadResMux ({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW); // select the result to be written to the FP register - mux4 #(64) FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW); + mux4 #(64) FPUResultMux (ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW); end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low assign FStallD = 0; diff --git a/wally-pipelined/src/fpu/fsm.sv b/wally-pipelined/src/fpu/fsm_fpdiv.sv similarity index 99% rename from wally-pipelined/src/fpu/fsm.sv rename to wally-pipelined/src/fpu/fsm_fpdiv.sv index 9b0e18a7..14358758 100755 --- a/wally-pipelined/src/fpu/fsm.sv +++ b/wally-pipelined/src/fpu/fsm_fpdiv.sv @@ -22,7 +22,7 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /////////////////////////////////////////// -module fsm ( +module fsm_fpdiv ( input logic clk, input logic reset, input logic start, diff --git a/wally-pipelined/src/fpu/rounder_div.sv b/wally-pipelined/src/fpu/rounder_div.sv index 1eb50644..03dcff7a 100755 --- a/wally-pipelined/src/fpu/rounder_div.sv +++ b/wally-pipelined/src/fpu/rounder_div.sv @@ -52,13 +52,13 @@ module rounder_div ( output logic [4:0] Flags ); - logic Rsign; - logic [10:0] Rexp; - logic [12:0] Texp; - logic [51:0] Rmant; - logic [63:0] Tmant; - logic [51:0] Smant; - logic Rzero; + logic Rsign; + logic [10:0] Rexp; + logic [12:0] Texp; + logic [51:0] Rmant; + logic [63:0] Tmant; + logic [51:0] Smant; + logic Rzero; logic Gdp, Gsp, G; logic UnFlow_SP, UnFlow_DP, UnderFlow; logic OvFlow_SP, OvFlow_DP, OverFlow; @@ -187,9 +187,9 @@ module rounder_div ( assign NaN_Sign_out = ~XNaNQ&YNaNQ ? Float2[63] : Float1[63]; assign Sign_out = (XZeroQ&YZeroQ | XInfQ&YInfQ)&~op_type | Rsign&~XNaNQ&~YNaNQ | NaN_Sign_out&(XNaNQ|YNaNQ); - // FIXME (jes) - Imperas gives sNaN a Sign=0 where x86 gives Sign=1 - // | Float1[63]&op_type; + // | Float1[63]&op_type; (logic to fix this but removed for now) + assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero); assign Rmant[50:0] = ({51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}}) | (NaN_out&{51{NaN}}))&({51{~(op_type&Float1[63]&~XZeroQ)}}); diff --git a/wally-pipelined/testbench/testbench-f64.sv b/wally-pipelined/testbench/testbench-f64.sv index 5ae96f83..e3cdc84d 100755 --- a/wally-pipelined/testbench/testbench-f64.sv +++ b/wally-pipelined/testbench/testbench-f64.sv @@ -49,7 +49,6 @@ module testbench (); integer handle3; integer desc3; - integer desc4; // instantiate device under test unpacking unpacking(.X(op1), .Y(op2), .Z(64'h0), .FOpCtrlE, .FmtE, @@ -111,8 +110,6 @@ module testbench (); @(posedge clk); $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected)); vectornum = vectornum + 1; - if (vectornum == 40) - $finish; if (testvectors[vectornum] === 200'bx) begin $display("%d tests completed", vectornum); $finish;