Merge branch 'main' of github.com:davidharrishmc/riscv-wally

2022-12-29 17:07:53 -06:00 · 2022-12-29 17:07:53 -06:00 · c725b5534a
commit c725b5534a
parent 654b10894c 776f4714af
16 changed files with 239 additions and 165 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,6 +9,7 @@ __pycache__/
 #External repos
 addins/riscv-arch-test/Makefile.include
 addins/riscv-tests/target
+addins/TestFloat-3e/build/Linux-x86_64-GCC/*
 benchmarks/embench/wally*.json

 #vsim work files to ignore
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@ -110,7 +110,7 @@

 // division constants
 `define RADIX 32'h2
-`define DIVCOPIES 32'h1
+`define DIVCOPIES 32'h4
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : `NF+3)
 // `define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input
 `define DIVN (`NF<`XLEN ? `XLEN : (`NF + 3)) // length of input
--- a/pipelined/radixcopiesmultiregression.sh
+++ b/pipelined/radixcopiesmultiregression.sh
@ -0,0 +1,48 @@
+#!/bin/bash
+
+configFile=config/shared/wally-shared.vh
+
+searchRadix="define RADIX 32'"..
+searchCopies="define DIVCOPIES 32'"..
+
+currRadix="define RADIX 32'h2"
+currCopies="define DIVCOPIES 32'h1"
+sed -i "s/$searchRadix/$currRadix/" $configFile
+sed -i "s/$searchCopies/$currCopies/" $configFile
+echo regression on Radix :$currRadix: and Copies :$currCopies:
+./regression/regression-wally
+
+currRadix="define RADIX 32'h2"
+currCopies="define DIVCOPIES 32'h2"
+sed -i "s/$searchRadix/$currRadix/" $configFile
+sed -i "s/$searchCopies/$currCopies/" $configFile
+echo regression on Radix :$currRadix: and Copies :$currCopies:
+./regression/regression-wally
+
+currRadix="define RADIX 32'h2"
+currCopies="define DIVCOPIES 32'h4"
+sed -i "s/$searchRadix/$currRadix/" $configFile
+sed -i "s/$searchCopies/$currCopies/" $configFile
+echo regression on Radix :$currRadix: and Copies :$currCopies:
+./regression/regression-wally
+
+currRadix="define RADIX 32'h4"
+currCopies="define DIVCOPIES 32'h1"
+sed -i "s/$searchRadix/$currRadix/" $configFile
+sed -i "s/$searchCopies/$currCopies/" $configFile
+echo regression on Radix :$currRadix: and Copies :$currCopies:
+./regression/regression-wally
+
+currRadix="define RADIX 32'h4"
+currCopies="define DIVCOPIES 32'h2"
+sed -i "s/$searchRadix/$currRadix/" $configFile
+sed -i "s/$searchCopies/$currCopies/" $configFile
+echo regression on Radix :$currRadix: and Copies :$currCopies:
+./regression/regression-wally
+
+currRadix="define RADIX 32'h4"
+currCopies="define DIVCOPIES 32'h4"
+sed -i "s/$searchRadix/$currRadix/" $configFile
+sed -i "s/$searchCopies/$currCopies/" $configFile
+echo regression on Radix :$currRadix: and Copies :$currCopies:
+./regression/regression-wally
--- a/pipelined/src/fpu/fctrl.sv
+++ b/pipelined/src/fpu/fctrl.sv
@ -81,9 +81,9 @@ module fctrl (
                         (Fmt == 2'b10 & `ZFH_SUPPORTED) | (Fmt == 2'b11 & `Q_SUPPORTED));
  always_comb
    if (STATUS_FS == 2'b00) // FPU instructions are illegal when FPU is disabled
-      ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0;
+      ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0;
    else if (OpD != 7'b0000111 & OpD != 7'b0100111 & ~SupportedFmt) 
-      ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // for anything other than loads and stores, check for supported format
+      ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0; // for anything other than loads and stores, check for supported format
    else case(OpD)
    // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt
      7'b0000111: case(Funct3D)
@ -94,7 +94,7 @@ module fctrl (
                             else                ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // flq not supported
                    3'b001:  if (`ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_xx_0xx_0_0_0; // flh
                             else                ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // flh not supported
-                    default:                     ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // non-implemented instruction
+                    default:                     ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0; // non-implemented instruction
                  endcase
      7'b0100111: case(Funct3D)
                    3'b010:                      ControlsD = `FCTRLW'b0_0_10_xx_0xx_0_0_0; // fsw
@ -104,7 +104,7 @@ module fctrl (
                             else                ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // fsq not supported
                    3'b001:  if (`ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_xx_0xx_0_0_0; // fsh
                             else                ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // fsh not supported
-                    default:                     ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // non-implemented instruction
+                    default:                     ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0; // non-implemented instruction
                  endcase
      7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0; // fmadd
      7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0; // fmsub
@ -120,23 +120,23 @@ module fctrl (
                                  3'b000:  ControlsD = `FCTRLW'b1_0_00_xx_000_0_0_0; // fsgnj
                                  3'b001:  ControlsD = `FCTRLW'b1_0_00_xx_001_0_0_0; // fsgnjn
                                  3'b010:  ControlsD = `FCTRLW'b1_0_00_xx_010_0_0_0; // fsgnjx
-                                  default: ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // non-implemented instruction
+                                  default: ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0; // non-implemented instruction
                                endcase
                    7'b00101??: case(Funct3D)
                                  3'b000:  ControlsD = `FCTRLW'b1_0_00_xx_110_0_0_0; // fmin
                                  3'b001:  ControlsD = `FCTRLW'b1_0_00_xx_101_0_0_0; // fmax
-                                  default: ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // non-implemented instruction
+                                  default: ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0; // non-implemented instruction
                                endcase
                    7'b10100??: case(Funct3D)
                                  3'b010:  ControlsD = `FCTRLW'b0_1_00_xx_010_0_0_0; // feq
                                  3'b001:  ControlsD = `FCTRLW'b0_1_00_xx_001_0_0_0; // flt
                                  3'b000:  ControlsD = `FCTRLW'b0_1_00_xx_011_0_0_0; // fle
-                                  default: ControlsD = `FCTRLW'b0_0_00_xx_0xx__0_1_0; // non-implemented instruction
+                                  default: ControlsD = `FCTRLW'b0_0_00_xx_000__0_1_0; // non-implemented instruction
                                endcase
                    7'b11100??: if (Funct3D == 3'b001)          ControlsD = `FCTRLW'b0_1_10_xx_000_0_0_0; // fclass
                                else if (Funct3D[1:0] == 2'b00) ControlsD = `FCTRLW'b0_1_11_xx_000_0_0_0; // fmv.x.w   to int reg
                                else if (Funct3D[1:0] == 2'b01) ControlsD = `FCTRLW'b0_1_11_xx_000_0_0_0; // fmv.x.d   to int reg
-                                else                            ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // non-implemented instruction
+                                else                            ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0; // non-implemented instruction
                    7'b1101000: case(Rs2D[1:0])
                                  2'b00:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0; // fcvt.s.w   w->s
                                  2'b01:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0; // fcvt.s.wu wu->s
@ -165,7 +165,7 @@ module fctrl (
                                endcase
                    7'b1111001: ControlsD = `FCTRLW'b1_0_00_xx_011_0_0_0; // fmv.d.x   to fp reg
                    7'b0100001: ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0; // fcvt.d.s
-                    default:    ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1_0; // non-implemented instruction
+                    default:    ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0; // non-implemented instruction
                  endcase
      default:      ControlsD = `FCTRLW'b0_0_00_xx_000_0_1_0; // non-implemented instruction
    endcase
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@ -68,27 +68,27 @@ module fdivsqrt(
  logic [`DIVBLEN:0] nE, nM, mM;
  logic NegQuotM, ALTBM, AsM, W64M;
  logic DivStartE;
-  logic [`XLEN-1:0] ForwardedSrcAM;
+  logic [`XLEN-1:0] AM;

  fdivsqrtpreproc fdivsqrtpreproc(
    .clk, .IFDivStartE, .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
-    .Sqrt(SqrtE), .Ym(YmE), .XZeroE, .X, .DPreproc, .ForwardedSrcAM, .MDUM, .W64M,
+    .Sqrt(SqrtE), .Ym(YmE), .XZeroE, .X, .DPreproc, .AM, .MDUM, .W64M,
    .nE, .nM, .mM, .NegQuotM, .ALTBM, .AZeroM, .BZeroM, .AZeroE, .BZeroE, .AsM,
    .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .MDUE, .W64E);
  fdivsqrtfsm fdivsqrtfsm(
    .clk, .reset, .FmtE, .XsE, .SqrtE, .nE,
-    .FDivBusyE, .FDivStartE, .IDivStartE, .IFDivStartE, .FDivDoneE, .StallM, .FlushE, /*.DivDone, */ 
+    .FDivBusyE, .FDivStartE, .IDivStartE, .IFDivStartE, .FDivDoneE, .StallM, .FlushE, 
    .XZeroE, .YZeroE, .AZeroE, .BZeroE,
    .XNaNE, .YNaNE, .MDUE,
    .XInfE, .YInfE, .WZeroE, .SpecialCaseM);
  fdivsqrtiter fdivsqrtiter(
-    .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, // .SqrtM,
+    .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, 
    .X,.DPreproc, .FirstWS(WS), .FirstWC(WC),
    .IFDivStartE, .FDivBusyE);
  fdivsqrtpostproc fdivsqrtpostproc(
    .clk, .reset, .StallM,
    .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, .Firstun, 
-    .SqrtM, .SpecialCaseM, .RemOpM(Funct3M[1]), .ForwardedSrcAM,
+    .SqrtM, .SpecialCaseM, .RemOpM(Funct3M[1]), .AM,
    .nM, .ALTBM, .mM, .BZeroM, .AsM, .NegQuotM, .W64M,
    .QmM, .WZeroE, .DivSM, .FPIntDivResultM);
 endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@ -34,7 +34,7 @@ module fdivsqrtexpcalc(
  input  logic [`FMTBITS-1:0] Fmt,
  input  logic [`NE-1:0] Xe, Ye,
  input  logic Sqrt,
-  input  logic XZeroE, 
+  input  logic XZero, 
  input  logic [`DIVBLEN:0] ell, m,
  output logic [`NE+1:0] Qe
  );
@ -70,7 +70,7 @@ module fdivsqrtexpcalc(
  assign SXExp = {2'b0, Xe} - {{(`NE+1-`DIVBLEN){1'b0}}, ell} - (`NE+2)'(`BIAS);
  assign SExp  = {SXExp[`NE+1], SXExp[`NE+1:1]} + {2'b0, Bias};
  // correct exponent for denormalized input's normalization shifts
-  assign DExp  = ({2'b0, Xe} - {{(`NE+1-`DIVBLEN){1'b0}}, ell} - {2'b0, Ye} + {{(`NE+1-`DIVBLEN){1'b0}}, m} + {3'b0, Bias}) & {`NE+2{~XZeroE}};
+  assign DExp  = ({2'b0, Xe} - {{(`NE+1-`DIVBLEN){1'b0}}, ell} - {2'b0, Ye} + {{(`NE+1-`DIVBLEN){1'b0}}, m} + {3'b0, Bias}) & {`NE+2{~XZero}};
  
  assign Qe = Sqrt ? SExp : DExp;
 endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@ -65,8 +65,10 @@ module fdivsqrtfsm(

  // terminate immediately on special cases
  assign FSpecialCaseE = XZeroE | (YZeroE&~SqrtE) | XInfE | YInfE | XNaNE | YNaNE | (XsE&SqrtE);
+  if (`IDIV_ON_FPU) begin
    assign ISpecialCaseE = AZeroE | BZeroE; // *** why is AZeroE part of this.  Should other special cases be considered?
    assign SpecialCaseE  = MDUE ? ISpecialCaseE : FSpecialCaseE;
+  end else assign SpecialCaseE = FSpecialCaseE;
  flopenr #(1) SpecialCaseReg(clk, reset, ~StallM, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc

 // DIVN = `NF+3
@ -103,7 +105,8 @@ module fdivsqrtfsm(
  always_comb begin 
    if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
    else       fbits = Nf + 2 + `LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
-    cycles =  MDUE ? (nE + 1) : (fbits + (`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES);
+    if (`IDIV_ON_FPU) cycles =  MDUE ? ((nE + 1)/`DIVCOPIES) : (fbits + (`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES);
+    else              cycles = (fbits + (`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES);
  end 

  /* verilator lint_on WIDTH */
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@ -39,7 +39,7 @@ module fdivsqrtpostproc(
  input  logic [`DIVb+1:0]  FirstC,
  input  logic              SqrtE,
  input  logic              Firstun, SqrtM, SpecialCaseM, NegQuotM,
-	input  logic [`XLEN-1:0]  ForwardedSrcAM,
+	input  logic [`XLEN-1:0]  AM,
  input  logic              RemOpM, ALTBM, BZeroM, AsM, W64M,
  input  logic [`DIVBLEN:0] nM, mM,
  output logic [`DIVb:0]    QmM, 
@ -98,11 +98,14 @@ module fdivsqrtpostproc(

  // Determine if sticky bit is negative  // *** look for ways to optimize this.  Shift shouldn't be needed.
  assign Sum = WC + WS;
-  assign W = $signed(Sum) >>> `LOGR;
-  assign NegStickyM = W[`DIVb+3];
-  assign DM = {4'b0001, D};
+  assign NegStickyM = Sum[`DIVb+3];
  
-  // *** put conditionals on integer division hardware, move to its own module
+  assign PreQmM = NegStickyM ? FirstUM : FirstU; // Select U or U-1 depending on negative sticky bit
+  assign QmM = SqrtM ? (PreQmM << 1) : PreQmM;
+
+  if (`IDIV_ON_FPU) begin
+    assign W = $signed(Sum) >>> `LOGR;
+    assign DM = {4'b0001, D};

    // Integer division: sign handling for div and rem
    always_comb 
@ -127,7 +130,7 @@ module fdivsqrtpostproc(
    always_comb
      if (ALTBM) begin
        IntQuotM = '0;
-      IntRemM  = {{(`DIVb-`XLEN+4){1'b0}}, ForwardedSrcAM};
+        IntRemM  = {{(`DIVb-`XLEN+4){1'b0}}, AM};
      end else begin
        logic [`DIVb+3:0] PreIntQuotM;
        if (WZeroM) begin
@ -167,10 +170,8 @@ module fdivsqrtpostproc(
    // division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
    
    assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
-  assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? ForwardedSrcAM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
+    assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
    // *** conditional on RV64
    assign FPIntDivResultM = (W64M ? {{(`XLEN-32){SpecialFPIntDivResultM[31]}}, SpecialFPIntDivResultM[31:0]} : SpecialFPIntDivResultM[`XLEN-1:0]); // Sign extending in case of W64
- 
-  assign PreQmM = NegStickyM ? FirstUM : FirstU; // Select U or U-1 depending on negative sticky bit
-  assign QmM = SqrtM ? (PreQmM << 1) : PreQmM;
+  end
 endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@ -47,7 +47,7 @@ module fdivsqrtpreproc (
  output logic [`NE+1:0] QeM,
  output logic [`DIVb+3:0] X,
  output logic [`DIVb-1:0] DPreproc,
-  output logic [`XLEN-1:0] ForwardedSrcAM
+  output logic [`XLEN-1:0] AM
 );

  logic  [`DIVb-1:0] XPreproc;
@ -56,9 +56,6 @@ module fdivsqrtpreproc (
  logic  [`NE+1:0] QeE;
  // Intdiv signals
  logic  [`DIVb-1:0] IFNormLenX, IFNormLenD;
-  logic  [`XLEN-1:0] PosA, PosB;
-  logic  AsE, BsE, ALTBE, NegQuotE;
-  logic  [`XLEN-1:0]  A64, B64, A64Src;
  logic  [`DIVBLEN:0] mE;
  logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
  logic  [`DIVBLEN:0] pPlusr, pPrCeil, p, ell;
@ -69,77 +66,98 @@ module fdivsqrtpreproc (
  // ***can probably merge X LZC with conversion
  // cout the number of leading zeros

-  // *** W64 muxes conditional on RV64
-  assign AsE = ~Funct3E[0] & (W64E ? ForwardedSrcAE[31] : ForwardedSrcAE[`XLEN-1]);
-  assign BsE = ~Funct3E[0] & (W64E ? ForwardedSrcBE[31] : ForwardedSrcBE[`XLEN-1]);
-  assign A64 = W64E ? {{(`XLEN-32){AsE}}, ForwardedSrcAE[31:0]} : ForwardedSrcAE;
-  assign B64 = W64E ? {{(`XLEN-32){BsE}}, ForwardedSrcBE[31:0]} : ForwardedSrcBE;
-  assign A64Src = W64E ? {{(`XLEN-32){ForwardedSrcAE[31]}}, ForwardedSrcAE[31:0]} : ForwardedSrcAE;
+  if (`IDIV_ON_FPU) begin
+    logic signedDiv;
+    logic  AsE, BsE, ALTBE, NegQuotE;
+    logic  [`XLEN-1:0]  AE, BE;
+    logic  [`XLEN-1:0] PosA, PosB;

-  assign NegQuotE = (AsE ^ BsE) & MDUE;
-  
-  assign PosA = AsE ? -A64 : A64;
-  assign PosB = BsE ? -B64 : B64;
+    // Extract inputs, signs, zero, depending on W64 mode if applicable
+    assign signedDiv = ~Funct3E[0];
+    if (`XLEN==64) begin // 64-bit, supports W64
+      assign AsE = signedDiv & (W64E ? ForwardedSrcAE[31] : ForwardedSrcAE[`XLEN-1]);
+      assign BsE = signedDiv & (W64E ? ForwardedSrcBE[31] : ForwardedSrcBE[`XLEN-1]);
+      assign AE = W64E ? {{(`XLEN-32){AsE}}, ForwardedSrcAE[31:0]} : ForwardedSrcAE;  
+      assign BE = W64E ? {{(`XLEN-32){BsE}}, ForwardedSrcBE[31:0]} : ForwardedSrcBE;
      assign AZeroE = W64E ? ~(|ForwardedSrcAE[31:0]) : ~(|ForwardedSrcAE);
      assign BZeroE = W64E ? ~(|ForwardedSrcBE[31:0]) : ~(|ForwardedSrcBE);
+    end else begin // 32 bits only
+      assign AsE = signedDiv & ForwardedSrcAE[`XLEN-1];
+      assign BsE = signedDiv & ForwardedSrcBE[`XLEN-1];
+      assign AE = ForwardedSrcAE;
+      assign BE = ForwardedSrcBE;
+      assign AZeroE = ~(|ForwardedSrcAE);
+      assign BZeroE = ~(|ForwardedSrcBE);
+    end

+    // Quotient is negative
+    assign NegQuotE = (AsE ^ BsE) & MDUE;
+    
+    // Force inputs to be positive
+    assign PosA = AsE ? -AE : AE;
+    assign PosB = BsE ? -BE : BE;
+
+    // Select integer or floating point inputs 
    assign IFNormLenX = MDUE ? {PosA, {(`DIVb-`XLEN){1'b0}}} : {Xm, {(`DIVb-`NF-1){1'b0}}};
    assign IFNormLenD = MDUE ? {PosB, {(`DIVb-`XLEN){1'b0}}} : {Ym, {(`DIVb-`NF-1){1'b0}}};
-  lzc #(`DIVb) lzcX (IFNormLenX, ell);
-  lzc #(`DIVb) lzcY (IFNormLenD, mE);
-
-  assign XPreproc = IFNormLenX << (ell + {{`DIVBLEN{1'b0}}, 1'b1}); // had issue with (`DIVBLEN+1)'(~MDUE) so using this instead
-  assign DPreproc = IFNormLenD << (mE + {{`DIVBLEN{1'b0}}, 1'b1}); // replaced ~MDUE with 1 bc we always want that extra left shift

+    // Difference in number of leading zeros
    assign ZeroDiff = mE - ell;
    assign ALTBE = ZeroDiff[`DIVBLEN]; // A less than B
    assign p = ALTBE ? '0 : ZeroDiff;

-/* verilator lint_off WIDTH */
+  /* verilator lint_off WIDTH */
+    // right shift amount to complete in discrete number of steps
    assign pPlusr = (`DIVBLEN)'(`LOGR) + p;
    assign pPrTrunc = pPlusr % `RK;
-//assign pPrTrunc = (`LOGRK == 0) ? 0 : pPlusr[`LOGRK-1:0];
    assign pPrCeil = (pPlusr >> `LOGRK) + {{`DIVBLEN{1'b0}}, |(pPrTrunc)};
    assign nE = (pPrCeil * (`DIVBLEN+1)'(`DIVCOPIES)) - {{(`DIVBLEN){1'b0}}, 1'b1};
    assign IntBits = (`DIVBLEN)'(`LOGR) + p - {{(`DIVBLEN){1'b0}}, 1'b1};
    assign RightShiftX = ((`DIVBLEN)'(`RK) - 1) - (IntBits % `RK);
-//assign RightShiftX = (`LOGRK == 0) ? 0 : ((`DIVBLEN)'(`RK) - 1) - {{(`DIVBLEN - `RK){1'b0}}, IntBits[`LOGRK-1:0]};
-/* verilator lint_on WIDTH */
+  /* verilator lint_on WIDTH */

+    // Select integer or floating-point operands
    assign NumZeroE = MDUE ? AZeroE : XZeroE;
+    assign X = MDUE ? DivX >> RightShiftX : PreShiftX;

+    // pipeline registers
+    flopen #(1)        mdureg(clk, IFDivStartE, MDUE, MDUM);
+    flopen #(1)        w64reg(clk, IFDivStartE, W64E, W64M);
+    flopen #(`DIVBLEN+1) nreg(clk, IFDivStartE, nE, nM);
+    flopen #(`DIVBLEN+1) mreg(clk, IFDivStartE, mE, mM);
+    flopen #(1)       altbreg(clk, IFDivStartE, ALTBE, ALTBM);
+    flopen #(1)    negquotreg(clk, IFDivStartE, NegQuotE, NegQuotM);
+    flopen #(1)      azeroreg(clk, IFDivStartE, AZeroE, AZeroM);
+    flopen #(1)      bzeroreg(clk, IFDivStartE, BZeroE, BZeroM);
+    flopen #(1)      asignreg(clk, IFDivStartE, AsE, AsM);
+    flopen #(`XLEN)   srcareg(clk, IFDivStartE, AE, AM);
+
+  end else begin
+    assign IFNormLenX = {Xm, {(`DIVb-`NF-1){1'b0}}};
+    assign IFNormLenD = {Ym, {(`DIVb-`NF-1){1'b0}}};
+    assign NumZeroE = XZeroE;
+    assign X = PreShiftX;
+  end
+
+  // count leading zeros for denorm FP and to normalize integer inputs
+  lzc #(`DIVb) lzcX (IFNormLenX, ell);
+  lzc #(`DIVb) lzcY (IFNormLenD, mE);
+
+  // Normalization shift
+  assign XPreproc = IFNormLenX << (ell + {{`DIVBLEN{1'b0}}, 1'b1}); 
+  assign DPreproc = IFNormLenD << (mE + {{`DIVBLEN{1'b0}}, 1'b1}); 
+
+  //  append leading 1 (for nonzero inputs) and zero-extend
  assign SqrtX = (Xe[0]^ell[0]) ? {1'b0, ~NumZeroE, XPreproc[`DIVb-1:1]} : {~NumZeroE, XPreproc}; // Bottom bit of XPreproc is always zero because DIVb is larger than XLEN and NF
  assign DivX = {3'b000, ~NumZeroE, XPreproc};

  // *** explain why X is shifted between radices (initial assignment of WS=RX)
  if (`RADIX == 2)  assign PreShiftX = Sqrt ? {3'b111, SqrtX} : DivX;
  else              assign PreShiftX = Sqrt ? {2'b11, SqrtX, 1'b0} : DivX;
-  assign X = MDUE ? DivX >> RightShiftX : PreShiftX;

-  fdivsqrtexpcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZeroE, .ell, .m(mE), .Qe(QeE));
+  // Floating-point exponent
+  fdivsqrtexpcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZero(XZeroE), .ell, .m(mE), .Qe(QeE));

-  //           radix 2     radix 4
-  // 1 copies  DIVLEN+2    DIVLEN+2/2
-  // 2 copies  DIVLEN+2/2  DIVLEN+2/2*2
-  // 4 copies  DIVLEN+2/4  DIVLEN+2/2*4
-  // 8 copies  DIVLEN+2/8  DIVLEN+2/2*8
-
-  // DIVRESLEN = DIVLEN or DIVLEN+2
-  // r = 1 or 2
-  // DIVRESLEN/(r*`DIVCOPIES)
-
-  flopen #(1)    negquotreg(clk, IFDivStartE, NegQuotE, NegQuotM);
-  flopen #(1)       altbreg(clk, IFDivStartE, ALTBE, ALTBM);
-  flopen #(1)      azeroreg(clk, IFDivStartE, AZeroE, AZeroM);
-  flopen #(1)      bzeroreg(clk, IFDivStartE, BZeroE, BZeroM);
-  flopen #(1)      asignreg(clk, IFDivStartE, AsE, AsM);
-  flopen #(1)        mdureg(clk, IFDivStartE, MDUE, MDUM);
-  flopen #(1)        w64reg(clk, IFDivStartE, W64E, W64M);
-  flopen #(`DIVBLEN+1) nreg(clk, IFDivStartE, nE, nM);
-  flopen #(`DIVBLEN+1) mreg(clk, IFDivStartE, mE, mM);
  flopen #(`NE+2)    expreg(clk, IFDivStartE, QeE, QeM);
-  flopen #(`XLEN)   srcareg(clk, IFDivStartE, A64Src, ForwardedSrcAM);
-
-
 endmodule

--- a/pipelined/src/muldiv/intdivrestoring.sv
+++ b/pipelined/src/muldiv/intdivrestoring.sv
--- a/pipelined/src/muldiv/intdivrestoringstep.sv
+++ b/pipelined/src/muldiv/intdivrestoringstep.sv
--- a/pipelined/src/muldiv/muldiv.sv
+++ b/pipelined/src/muldiv/muldiv.sv
@ -1,5 +1,5 @@
 ///////////////////////////////////////////
-// muldiv.sv
+// mdu.sv
 //
 // Written: David_Harris@hmc.edu 9 January 2021
 // Modified: 
@ -30,7 +30,7 @@

 `include "wally-config.vh"

-module muldiv (
+module mdu (
 	       input logic 		clk, reset,
 	       // Execute Stage interface
 	       //    input logic [`XLEN-1:0] 	SrcAE, SrcBE,
@ -94,6 +94,6 @@ module muldiv (

 	// Writeback stage pipeline register
 	flopenrc #(`XLEN) MDUResultWReg(clk, reset, FlushW, ~StallW, MDUResultM, MDUResultW);	 
-endmodule // muldiv
+endmodule // mdu


--- a/pipelined/src/muldiv/mul.sv
+++ b/pipelined/src/muldiv/mul.sv
--- a/pipelined/src/wally/wallypipelinedcore.sv
+++ b/pipelined/src/wally/wallypipelinedcore.sv
@ -370,7 +370,7 @@ module wallypipelinedcore (
      assign BigEndianM = 0;
   end
   if (`M_SUPPORTED) begin:mdu
-      muldiv mdu(
+      mdu mdu(
         .clk, .reset,
         .ForwardedSrcAE, .ForwardedSrcBE, 
         .Funct3E, .Funct3M, .MDUE, .W64E,
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@ -82,7 +82,7 @@ module testbenchfp;
 	logic [`LOGCVTLEN-1:0] CvtShiftAmtE;  // how much to shift by
 	logic [`DIVb:0] Quot;
  logic CvtResDenormUfE;
-  logic DivStart, FDivBusyE;
+  logic DivStart, FDivBusyE, OldFDivBusyE;
  logic reset = 1'b0;
  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
  logic [`DURLEN-1:0] Dur;
@ -689,12 +689,12 @@ module testbenchfp;
            .Xe(Xe), .Ye(Ye), .Ze(Ze), 
            .Xm(Xm), .Ym(Ym), .Zm(Zm),
            .XZero, .YZero, .ZZero, .Ss, .Se,
-            .OpCtrl(OpCtrlVal), .Fmt(ModFmt), .Sm, .InvA, .SCnt, .As, .Ps,
+            .OpCtrl(OpCtrlVal), .Sm, .InvA, .SCnt, .As, .Ps,
            .ZmSticky); 
  end
              
  postprocess postprocess(.Xs(Xs), .Ys(Ys), .PostProcSel(UnitVal[1:0]),
-              .Ze(Ze),  .ZDenorm(ZDenorm), .OpCtrl(OpCtrlVal), .DivQm(Quot), .DivQe(DivCalcExp),
+              .ZDenorm(ZDenorm), .OpCtrl(OpCtrlVal), .DivQm(Quot), .DivQe(DivCalcExp),
              .Xm(Xm), .Ym(Ym), .Zm(Zm), .CvtCe(CvtCalcExpE), .DivS(DivSticky), .FmaSs(Ss),
              .XNaN(XNaN), .YNaN(YNaN), .ZNaN(ZNaN), .CvtResDenormUf(CvtResDenormUfE),
              .XZero(XZero), .YZero(YZero), .ZZero(ZZero), .CvtShiftAmt(CvtShiftAmtE),
@ -719,8 +719,8 @@ module testbenchfp;
    fdivsqrt fdivsqrt(.clk, .reset, .XsE(Xs), .FmtE(ModFmt), .XmE(Xm), .YmE(Ym), .XeE(Xe), .YeE(Ye), .SqrtE(OpCtrlVal[0]), .SqrtM(OpCtrlVal[0]),
                    .XInfE(XInf), .YInfE(YInf), .XZeroE(XZero), .YZeroE(YZero), .XNaNE(XNaN), .YNaNE(YNaN), 
                    .FDivStartE(DivStart), .IDivStartE(1'b0), .MDUE(1'b0), .W64E(1'b0),
-                    .StallE(1'b0), .StallM(1'b0), .DivSM(DivSticky), .FDivBusyE, .QeM(DivCalcExp),
-                    .QmM(Quot), .DivDone);
+                    .StallM(1'b0), .DivSM(DivSticky), .FDivBusyE, .QeM(DivCalcExp),
+                    .QmM(Quot));
  end

  assign CmpFlg[3:0] = 0;
@ -811,6 +811,9 @@ end

  logic ResMatch, FlagMatch, CheckNow;

+always @(posedge clk) 
+  OldFDivBusyE <= FDivBusyE; // hold the previous-cycle FDivBusyE; nonblocking so the sample cannot race other posedge-clocked updates
+
 // check results on falling edge of clk
 always @(negedge clk) begin

@ -883,6 +886,7 @@ always @(negedge clk) begin
    ResMatch = (Res === Ans | NaNGood | NaNGood === 1'bx);
    FlagMatch = (ResFlg === AnsFlg | AnsFlg === 5'bx);
    divsqrtop = OpCtrlVal == `SQRT_OPCTRL | OpCtrlVal == `DIV_OPCTRL;
+    assign DivDone = OldFDivBusyE & ~FDivBusyE;

    //assign divsqrtop = OpCtrl[TestNum] == `SQRT_OPCTRL | OpCtrl[TestNum] == `DIV_OPCTRL;
    CheckNow = (DivDone | ~divsqrtop) & (UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT);
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
@ -138,6 +138,7 @@ string tvpaths[] = '{
  
 string imperas32f[] = '{
    `IMPERASTEST,
+    "rv32i_m/F/FSQRT-S-DYN-RDN-01",
    "rv32i_m/F/FADD-S-DYN-RDN-01",
    "rv32i_m/F/FADD-S-DYN-RMM-01",
    "rv32i_m/F/FADD-S-DYN-RNE-01",
@ -1198,8 +1199,6 @@ string imperas32f[] = '{

  string arch64d[] = '{
    `RISCVARCHTEST,
-    "rv64i_m/D/src/fsqrt.d_b1-01.S",
-    "rv64i_m/D/src/fdiv.d_b20-01.S",
    "rv64i_m/D/src/fadd.d_b10-01.S",
    "rv64i_m/D/src/fadd.d_b1-01.S",
    "rv64i_m/D/src/fadd.d_b11-01.S",