diff --git a/pipelined/src/cache/cache.sv b/pipelined/src/cache/cache.sv
index 696a9c921..cf02972d7 100644
--- a/pipelined/src/cache/cache.sv
+++ b/pipelined/src/cache/cache.sv
@@ -8,96 +8,108 @@
 //
 // A component of the Wally configurable RISC-V project.
 //
-// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
 //
-// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
-// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
-// may obtain a copy of the License at
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
 //
-// https://solderpad.org/licenses/SHL-2.1/
-//
-// Unless required by applicable law or agreed to in writing, any work distributed under the 
-// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
-// either express or implied. See the License for the specific language governing permissions 
-// and limitations under the License.
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 `include "wally-config.vh"
 
 module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGBWPL, WORDLEN, MUXINTERVAL, DCACHE) (
-  input  logic                   clk,
-  input  logic                   reset,
-  input  logic                   Stall,             // Stall the cache, preventing new accesses. In-flight access finished but does not return to READY
-  input  logic                   FlushStage,        // Pipeline flush of second stage (prevent writes and bus operations)
-  // cpu side
-  input  logic [1:0]             CacheRW,           // [1] Read, [0] Write 
-  input  logic [1:0]             CacheAtomic,       // Atomic operation
-  input  logic                   FlushCache,        // Flush all dirty lines back to memory
-  input  logic                   InvalidateCache,   // Clear all valid bits
-  input  logic [11:0]            NextAdr,           // Virtual address, but we only use the lower 12 bits.
-  input  logic [`PA_BITS-1:0]    PAdr,              // Physical address
-  input  logic [(WORDLEN-1)/8:0] ByteMask,          // Which bytes to write (D$ only)
-  input  logic [WORDLEN-1:0]     CacheWriteData,    // Data to write to cache (D$ only)
-  output logic                   CacheCommitted,    // Cache has started bus operation that shouldn't be interrupted
-  output logic                   CacheStall,        // Cache stalls pipeline during multicycle operation
-  output logic [WORDLEN-1:0]     ReadDataWord,      // Word read from cache (goes to CPU and bus)
-  // to performance counters to cpu
-  output logic                   CacheMiss,         // Cache miss
-  output logic                   CacheAccess,       // Cache access
-  // lsu control
-  input  logic                   SelHPTW,           // Use PAdr from Hardware Page Table Walker rather than NextAdr
-  // Bus fsm interface
-  input  logic                   CacheBusAck,       // Bus operation completed
-  input  logic                   SelBusBeat,        // Word in cache line comes from BeatCount
-  input  logic [LOGBWPL-1:0]     BeatCount,         // Beat in burst
-  input  logic [LINELEN-1:0]     FetchBuffer,       // Buffer long enough to hold entire cache line arriving from bus
-  output logic [1:0]             CacheBusRW,        // [1] Read or [0] write bus
-  output logic [`PA_BITS-1:0]    CacheBusAdr        // Address for bus access
-);
+  input logic                   clk,
+  input logic                   reset,
+   // cpu side
+  input logic                   FlushStage,
+  input logic                   Stall,
+  input logic [1:0]             CacheRW,
+  input logic [1:0]             CacheAtomic,
+  input logic                   FlushCache,
+  input logic                   InvalidateCache,
+  input logic [11:0]            NextAdr, // virtual address, but we only use the lower 12 bits.
+  input logic [`PA_BITS-1:0]    PAdr, // physical address
+  input logic [(WORDLEN-1)/8:0] ByteMask,
+  input logic [WORDLEN-1:0]     CacheWriteData,
+  output logic                  CacheCommitted,
+  output logic                  CacheStall,
+   // to performance counters to cpu
+  output logic                  CacheMiss,
+  output logic                  CacheAccess,
+   // lsu control
+  input logic                   SelHPTW,
+   // Bus fsm interface
+  output logic [1:0]            CacheBusRW,
+  input logic                   CacheBusAck,
+  input logic                   SelBusBeat, 
+  input logic [LOGBWPL-1:0]     BeatCount,
+  input logic [LINELEN-1:0]     FetchBuffer,
+  output logic [`PA_BITS-1:0]   CacheBusAdr,
+  output logic [WORDLEN-1:0]    ReadDataWord);
 
   // Cache parameters
-  localparam                     LINEBYTELEN = LINELEN/8;            // Line length in bytes
-  localparam                     OFFSETLEN = $clog2(LINEBYTELEN);    // Number of bits in offset field
-  localparam                     SETLEN = $clog2(NUMLINES);          // Number of set bits
-  localparam                     SETTOP = SETLEN+OFFSETLEN;          // Number of set plus offset bits
-  localparam                     TAGLEN = `PA_BITS - SETTOP;         // Number of tag bits
-  localparam                     WORDSPERLINE = LINELEN/WORDLEN;     // Number of words in cache line
-  localparam                     FLUSHADRTHRESHOLD = NUMLINES - 1;   // Used to determine when flush is complete
-  localparam                     LOGLLENBYTES = $clog2(WORDLEN/8);   // Number of bits to address a word
-  localparam                     CACHEWORDSPERLINE = `DCACHE_LINELENINBITS/WORDLEN; // *** see if this is the same as WORDSPERLINE
-  localparam                     LOGCWPL = $clog2(CACHEWORDSPERLINE); // ***
+  localparam                  LINEBYTELEN = LINELEN/8;
+  localparam                  OFFSETLEN = $clog2(LINEBYTELEN);
+  localparam                  SETLEN = $clog2(NUMLINES);
+  localparam                  SETTOP = SETLEN+OFFSETLEN;
+  localparam                  TAGLEN = `PA_BITS - SETTOP;
+  localparam                  WORDSPERLINE = LINELEN/WORDLEN;
+  localparam                  FlushAdrThreshold   = NUMLINES - 1;
 
-  logic                          SelAdr;
-  logic [1:0]                    AdrSelMuxSel;
-  logic [SETLEN-1:0]             CAdr;
-  logic [LINELEN-1:0]            LineWriteData;
-  logic                          ClearValid, ClearDirty, SetDirty, SetValid;
-  logic [LINELEN-1:0]            ReadDataLineWay [NUMWAYS-1:0];
-  logic [NUMWAYS-1:0]            HitWay, ValidWay;
-  logic                          CacheHit;
-  logic [NUMWAYS-1:0]            VictimWay, DirtyWay;
-  logic                          LineDirty;
-  logic [TAGLEN-1:0]             TagWay [NUMWAYS-1:0];
-  logic [TAGLEN-1:0]             Tag;
-  logic [SETLEN-1:0]             FlushAdr, NextFlushAdr, FlushAdrP1;
-  logic                          FlushAdrCntEn, FlushCntRst;
-  logic                          FlushAdrFlag, FlushWayFlag;
-  logic [NUMWAYS-1:0]            FlushWay, NextFlushWay;
-  logic                          FlushWayCntEn;
-  logic                          SelWriteback;
-  logic                          LRUWriteEn;
-  logic                          SelFlush;
-  logic                          ResetOrFlushCntRst;
-  logic [LINELEN-1:0]            ReadDataLine, ReadDataLineCache;
-  logic                          SelFetchBuffer;
-  logic                          CacheEn;
-  logic [CACHEWORDSPERLINE-1:0]  MemPAdrDecoded;
-  logic [LINELEN/8-1:0]          LineByteMask, DemuxedByteMask, FetchBufferByteSel;
+  logic                       SelAdr;
+  logic [SETLEN-1:0]          CAdr;
+  logic [LINELEN-1:0]         LineWriteData;
+  logic                       ClearValid;
+  logic                       ClearDirty;
+  logic [LINELEN-1:0]         ReadDataLineWay [NUMWAYS-1:0];
+  logic [NUMWAYS-1:0]         HitWay, ValidWay;
+  logic                       CacheHit;
+  logic                       SetDirty;
+  logic                       SetValid;
+  logic [NUMWAYS-1:0]         VictimWay;
+  logic [NUMWAYS-1:0]         DirtyWay;
+  logic                       LineDirty;
+  logic [TAGLEN-1:0]          TagWay [NUMWAYS-1:0];
+  logic [TAGLEN-1:0]          Tag;
+  logic [SETLEN-1:0]          FlushAdr;
+  logic [SETLEN-1:0]          NextFlushAdr;
+  logic [SETLEN-1:0]          FlushAdrP1;
+  logic                       FlushAdrCntEn;
+  logic                       FlushCntRst;
+  logic                       FlushAdrFlag;
+  logic                       FlushWayFlag;
+  logic [NUMWAYS-1:0]         FlushWay;
+  logic [NUMWAYS-1:0]         NextFlushWay;
+  logic                       FlushWayCntEn;
+  logic                       SelWriteback;
+  logic                       LRUWriteEn;
+  logic                       SelFlush;
+  logic                       ResetOrFlushCntRst;
+  logic [LINELEN-1:0]         ReadDataLine, ReadDataLineCache;
   logic [$clog2(LINELEN/8) - $clog2(MUXINTERVAL/8) - 1:0]          WordOffsetAddr;
+  logic                       SelFetchBuffer;
+  logic                       CacheEn;
   
-  genvar                         index;
+
+  localparam                  LOGLLENBYTES = $clog2(WORDLEN/8);
+  localparam                  CACHEWORDSPERLINE = `DCACHE_LINELENINBITS/WORDLEN;
+  localparam                  LOGCWPL = $clog2(CACHEWORDSPERLINE);
+  logic [CACHEWORDSPERLINE-1:0] MemPAdrDecoded;
+  logic [LINELEN/8-1:0]       LineByteMask, DemuxedByteMask, FetchBufferByteSel;
+  genvar                      index;
   
   /////////////////////////////////////////////////////////////////////////////////////////////
   // Read Path
@@ -107,100 +119,91 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGBWPL, WORDLEN, MUXINTE
   // and FlushAdr when handling D$ flushes
   // The icache must update to the newest PCNextF on flush as it is probably a trap.  Trap
   // sets PCNextF to XTVEC and the icache must start reading the instruction.
-  assign AdrSelMuxSel = {SelFlush, ((SelAdr | SelHPTW) & ~((DCACHE == 0) & FlushStage))};
-  mux3 #(SETLEN) AdrSelMux(.d0(NextAdr[SETTOP-1:OFFSETLEN]), .d1(PAdr[SETTOP-1:OFFSETLEN]), .d2(FlushAdr),
-    .s(AdrSelMuxSel), .y(CAdr));
+  mux3 #(SETLEN) AdrSelMux(
+    .d0(NextAdr[SETTOP-1:OFFSETLEN]), .d1(PAdr[SETTOP-1:OFFSETLEN]), .d2(FlushAdr),
+    .s({SelFlush, ((SelAdr | SelHPTW) & ~((DCACHE == 0) & FlushStage))}), .y(CAdr));
 
   // Array of cache ways, along with victim, hit, dirty, and read merging logic
-  cacheway #(NUMLINES, LINELEN, TAGLEN, OFFSETLEN, SETLEN, DCACHE) CacheWays[NUMWAYS-1:0](
-    .clk, .reset, .CacheEn, .CAdr, .PAdr, .LineWriteData, .LineByteMask,
+  cacheway #(NUMLINES, LINELEN, TAGLEN, OFFSETLEN, SETLEN, DCACHE) 
+    CacheWays[NUMWAYS-1:0](.clk, .reset, .CacheEn, .CAdr, .PAdr, .LineWriteData, .LineByteMask,
     .SetValid, .ClearValid, .SetDirty, .ClearDirty, .SelWriteback, .VictimWay,
     .FlushWay, .SelFlush, .ReadDataLineWay, .HitWay, .ValidWay, .DirtyWay, .TagWay, .FlushStage, .InvalidateCache);
-
-  // Select victim way for associative caches
   if(NUMWAYS > 1) begin:vict
     cacheLRU #(NUMWAYS, SETLEN, OFFSETLEN, NUMLINES) cacheLRU(
       .clk, .reset, .CacheEn, .FlushStage, .HitWay, .ValidWay, .VictimWay, .CAdr, .LRUWriteEn(LRUWriteEn & ~FlushStage),
       .SetValid, .PAdr(PAdr[SETTOP-1:OFFSETLEN]), .InvalidateCache, .FlushCache);
-  end else 
-    assign VictimWay = 1'b1; // one hot.
-
-  assign CacheHit = |HitWay;
-  assign LineDirty = |DirtyWay;
-
+  end else assign VictimWay = 1'b1; // one hot.
+  assign CacheHit = | HitWay;
+  assign LineDirty = | DirtyWay;
   // ReadDataLineWay is a 2d array of cache line len by number of ways.
   // Need to OR together each way in a bitwise manner.
   // Final part of the AO Mux.  First is the AND in the cacheway.
   or_rows #(NUMWAYS, LINELEN) ReadDataAOMux(.a(ReadDataLineWay), .y(ReadDataLineCache));
   or_rows #(NUMWAYS, TAGLEN) TagAOMux(.a(TagWay), .y(Tag));
 
-  // Data cache needs to choose word offset from PAdr or BeatCount to writeback dirty lines
+  // D$ word offset comes from PAdr or, when SelBusBeat is set, from BeatCount.  Would like to fix this.
   if(DCACHE) 
     mux2 #(LOGBWPL) WordAdrrMux(.d0(PAdr[$clog2(LINELEN/8) - 1 : $clog2(MUXINTERVAL/8)]), 
       .d1(BeatCount), .s(SelBusBeat),
       .y(WordOffsetAddr)); 
-  else 
-    assign WordOffsetAddr = PAdr[$clog2(LINELEN/8) - 1 : $clog2(MUXINTERVAL/8)];
+  else assign WordOffsetAddr = PAdr[$clog2(LINELEN/8) - 1 : $clog2(MUXINTERVAL/8)];
   
-  // Bypass cache array to save a cycle when finishing a load miss
   mux2 #(LINELEN) EarlyReturnMux(ReadDataLineCache, FetchBuffer, SelFetchBuffer, ReadDataLine);
 
-  // Select word from cache line
   subcachelineread #(LINELEN, WORDLEN, MUXINTERVAL) subcachelineread(
-    .PAdr(WordOffsetAddr), .ReadDataLine, .ReadDataWord);
+    .PAdr(WordOffsetAddr),
+    .ReadDataLine, .ReadDataWord);
   
-   // Bus address for fetch, writeback, or flush writeback
-  mux3 #(`PA_BITS) CacheBusAdrMux(.d0({PAdr[`PA_BITS-1:OFFSETLEN], {OFFSETLEN{1'b0}}}),
-		.d1({Tag, PAdr[SETTOP-1:OFFSETLEN], {OFFSETLEN{1'b0}}}),
-		.d2({Tag, FlushAdr, {OFFSETLEN{1'b0}}}),
-		.s({SelFlush, SelWriteback}), .y(CacheBusAdr));
-
   /////////////////////////////////////////////////////////////////////////////////////////////
-  // Write Path
+  // Write Path: Write data and address. Muxes between writes from bus and writes from CPU.
   /////////////////////////////////////////////////////////////////////////////////////////////
-
-  // Adjust byte mask from word to cache line
-  onehotdecoder #(LOGCWPL) adrdec(.bin(PAdr[LOGCWPL+LOGLLENBYTES-1:LOGLLENBYTES]), .decoded(MemPAdrDecoded));
+  onehotdecoder #(LOGCWPL) adrdec(
+    .bin(PAdr[LOGCWPL+LOGLLENBYTES-1:LOGLLENBYTES]), .decoded(MemPAdrDecoded));
   for(index = 0; index < 2**LOGCWPL; index++) begin
     assign DemuxedByteMask[(index+1)*(WORDLEN/8)-1:index*(WORDLEN/8)] = MemPAdrDecoded[index] ? ByteMask : '0;
   end
+
   assign FetchBufferByteSel = SetValid & ~SetDirty ? '1 : ~DemuxedByteMask;  // If load miss set all muxes to 1.
+  logic [LINELEN/8-1:0]       LineByteMask2; // NOTE(review): not referenced in the visible hunks -- confirm it is still needed
   assign LineByteMask = SetValid ? '1 : SetDirty ? DemuxedByteMask : '0;
 
-  // Merge write data into fetched cache line for store miss
   for(index = 0; index < LINELEN/8; index++) begin
     mux2 #(8) WriteDataMux(.d0(CacheWriteData[(8*index)%WORDLEN+7:(8*index)%WORDLEN]),
       .d1(FetchBuffer[8*index+7:8*index]), .s(FetchBufferByteSel[index]), .y(LineWriteData[8*index+7:8*index]));
   end
-   
-  /////////////////////////////////////////////////////////////////////////////////////////////
-  // Flush logic
-  /////////////////////////////////////////////////////////////////////////////////////////////
 
-  // Flush address (line number)
+  mux3 #(`PA_BITS) CacheBusAdrMux(.d0({PAdr[`PA_BITS-1:OFFSETLEN], {OFFSETLEN{1'b0}}}),
+		.d1({Tag, PAdr[SETTOP-1:OFFSETLEN], {OFFSETLEN{1'b0}}}),
+		.d2({Tag, FlushAdr, {OFFSETLEN{1'b0}}}),
+		.s({SelFlush, SelWriteback}), .y(CacheBusAdr));
+  
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // Flush address and way generation during flush
+  /////////////////////////////////////////////////////////////////////////////////////////////
   assign ResetOrFlushCntRst = reset | FlushCntRst;
-  flopenr #(SETLEN) FlushAdrReg(clk, ResetOrFlushCntRst, FlushAdrCntEn, FlushAdrP1, NextFlushAdr);
-  mux2    #(SETLEN) FlushAdrMux(NextFlushAdr, FlushAdrP1, FlushAdrCntEn, FlushAdr);
+  flopenr #(SETLEN) FlushAdrReg(.clk, .reset(ResetOrFlushCntRst), .en(FlushAdrCntEn), 
+    .d(FlushAdrP1), .q(NextFlushAdr));
+  assign FlushAdr = FlushAdrCntEn ? FlushAdrP1 : NextFlushAdr;
   assign FlushAdrP1 = NextFlushAdr + 1'b1;
-  assign FlushAdrFlag = (NextFlushAdr == FLUSHADRTHRESHOLD[SETLEN-1:0]);
-
-  // Flush way
-  flopenl #(NUMWAYS) FlushWayReg(clk, ResetOrFlushCntRst, FlushWayCntEn, {{NUMWAYS-1{1'b0}}, 1'b1}, NextFlushWay, FlushWay);
-  if(NUMWAYS > 1) assign NextFlushWay = {FlushWay[NUMWAYS-2:0], FlushWay[NUMWAYS-1]};
-  else            assign NextFlushWay = FlushWay[NUMWAYS-1];
+  assign FlushAdrFlag = (NextFlushAdr == FlushAdrThreshold[SETLEN-1:0]);
+  flopenl #(NUMWAYS) FlushWayReg(.clk, .load(ResetOrFlushCntRst), .en(FlushWayCntEn), 
+    .val({{NUMWAYS-1{1'b0}}, 1'b1}), .d(NextFlushWay), .q(FlushWay));
   assign FlushWayFlag = FlushWay[NUMWAYS-1];
+  if(NUMWAYS > 1) assign NextFlushWay = {FlushWay[NUMWAYS-2:0], FlushWay[NUMWAYS-1]};
+  else assign NextFlushWay = FlushWay[NUMWAYS-1];
 
   /////////////////////////////////////////////////////////////////////////////////////////////
   // Cache FSM
   /////////////////////////////////////////////////////////////////////////////////////////////
-
   cachefsm cachefsm(.clk, .reset, .CacheBusRW, .CacheBusAck, 
 		.FlushStage, .CacheRW, .CacheAtomic, .Stall,
  		.CacheHit, .LineDirty, .CacheStall, .CacheCommitted, 
 		.CacheMiss, .CacheAccess, .SelAdr, 
-		.ClearValid, .ClearDirty, .SetDirty, .SetValid, .SelWriteback, .SelFlush,
+		.ClearValid, .ClearDirty, .SetDirty,
+		.SetValid, .SelWriteback, .SelFlush,
 		.FlushAdrCntEn, .FlushWayCntEn, .FlushCntRst,
 		.FlushAdrFlag, .FlushWayFlag, .FlushCache, .SelFetchBuffer,
-    .InvalidateCache, .CacheEn, .LRUWriteEn);
-
+        .InvalidateCache,
+        .CacheEn,
+        .LRUWriteEn);
 endmodule 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
index 80cd4e3ab..b4c4964d2 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -42,6 +42,7 @@ module fdivsqrt(
   input  logic XNaNE, YNaNE, 
   input  logic FDivStartE, IDivStartE,
   input  logic StallM,
+  input  logic StallE,
   input  logic FlushE,
   input  logic SqrtE, SqrtM,
 	input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
@@ -74,17 +75,17 @@ module fdivsqrt(
     .clk, .IFDivStartE, .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
     .Sqrt(SqrtE), .Ym(YmE), .XZeroE, .X, .DPreproc, .ForwardedSrcAM,
     .nE, .nM, .mM, .CalcOTFCSwapE, .OTFCSwapE, .ALTBM, .AZeroM, .BZeroM, .AZeroE, .BZeroE, .As,
-    .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .MDUE, .W64E);
+    .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E);
   fdivsqrtfsm fdivsqrtfsm(
     .clk, .reset, .FmtE, .XsE, .SqrtE, .nE,
-    .FDivBusyE, .FDivStartE, .IDivStartE, .IFDivStartE, .FDivDoneE, .StallM, .FlushE, /*.DivDone, */ 
+    .FDivBusyE, .FDivStartE, .IDivStartE, .IFDivStartE, .FDivDoneE, .StallE, .StallM, .FlushE, /*.DivDone, */ 
     .XZeroE, .YZeroE, .AZeroE, .BZeroE,
     .XNaNE, .YNaNE, .MDUE,
     .XInfE, .YInfE, .WZeroM, .SpecialCaseM);
   fdivsqrtiter fdivsqrtiter(
     .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .MDUE, .SqrtE, // .SqrtM,
     .X,.DPreproc, .FirstWS(WS), .FirstWC(WC),
-    .IFDivStartE, .CalcOTFCSwapE, .OTFCSwapE,
+    .IFDivStartE, .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE, .CalcOTFCSwapE, .OTFCSwapE,
     .FDivBusyE);
   fdivsqrtpostproc fdivsqrtpostproc(
     .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .Firstun, 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index 430f79a2a..851dc27a5 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -41,6 +41,7 @@ module fdivsqrtfsm(
   input  logic FDivStartE, IDivStartE,
   input  logic XsE,
   input  logic SqrtE,
+  input  logic StallE,
   input  logic StallM,
   input  logic FlushE,
   input  logic WZeroM,
@@ -116,9 +117,9 @@ module fdivsqrtfsm(
           if (SpecialCaseE) state <= #1 DONE;
           else             state <= #1 BUSY;
       end else if (state == BUSY) begin
-          if (step == 1 | WZeroM)  state <= #1 DONE; // terminate early when residual is zero
+          if (step == 1)  state <= #1 DONE;
           step <= step - 1;
-      end else if ((state == DONE)) begin
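+      end else if ((state == DONE) | (WZeroM & (state == BUSY))) begin // NOTE(review): the WZeroM & BUSY term looks unreachable -- the preceding branch already handles state == BUSY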
         if (StallM) state <= #1 DONE;
         else        state <= #1 IDLE;
       end 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
index b91728eac..75145e55a 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -34,6 +34,8 @@ module fdivsqrtiter(
   input  logic clk,
   input  logic IFDivStartE, 
   input  logic FDivBusyE, 
+  input  logic [`NE-1:0] Xe, Ye,
+  input  logic XZeroE, YZeroE, 
   input  logic SqrtE, MDUE,
 //  input  logic SqrtM,
   input  logic CalcOTFCSwapE, OTFCSwapE,
@@ -62,6 +64,7 @@ module fdivsqrtiter(
   logic [`DIVb+3:0]      WSN, WCN;               // Q4.b
   logic [`DIVb+3:0]      DBar, D2, DBar2;        // Q4.b
   logic [`DIVb+1:0]      NextC;
+  logic [`DIVb+1:0]      CMux;
   logic [`DIVb:0]        UMux, UMMux;
   logic [`DIVb:0]        initU, initUM;
   /* verilator lint_on UNOPTFLAT */
@@ -91,8 +94,8 @@ module fdivsqrtiter(
   logic [1:0] initCUpper;
   assign initCUpper = (SqrtE & ~(MDUE)) ? 2'b11 : (`RADIX == 4) ? 2'b00 : 2'b10;
   assign initC = {initCUpper, {`DIVb{1'b0}}};
-  mux2 #(`DIVb+2) Cmux(C[`DIVCOPIES], initC, IFDivStartE, NextC); 
-  flopen #(`DIVb+2) creg(clk, IFDivStartE|FDivBusyE, NextC, C[0]);
+  mux2 #(`DIVb+2) Cmux(C[`DIVCOPIES], initC, IFDivStartE, CMux); 
+  flopen #(`DIVb+2) creg(clk, IFDivStartE|FDivBusyE, CMux, C[0]);
 
    // Divisior register
   flopen #(`DIVb) dreg(clk, IFDivStartE, DPreproc, D);
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 24441edab..5f9142982 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -123,7 +123,7 @@ module fdivsqrtpostproc(
       IntRemM  = NormRemM;
     end 
   
-  always_comb  // could merge into postprocessor shifter
+  always_comb
     if (RemOpM) begin
       NormShiftM = (mM + (`DIVBLEN+1)'(`DIVa));
       PreResultM = IntRemM;
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 902b02760..0bd3fae07 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -39,7 +39,7 @@ module fdivsqrtpreproc (
   input  logic Sqrt,
   input  logic XZeroE,
   input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
-	input  logic [2:0] 	Funct3E,
+	input  logic [2:0] 	Funct3E, Funct3M,
 	input  logic MDUE, W64E,
   output logic [`DIVBLEN:0] nE, nM, mM,
   output logic CalcOTFCSwapE, OTFCSwapE, ALTBM, As, AZeroM, BZeroM, AZeroE, BZeroE,
diff --git a/pipelined/src/fpu/fma/fma.sv b/pipelined/src/fpu/fma/fma.sv
index 0b8977cd0..ce42aaa19 100644
--- a/pipelined/src/fpu/fma/fma.sv
+++ b/pipelined/src/fpu/fma/fma.sv
@@ -84,7 +84,7 @@ module fma(
     // // Addition/LZA
     // ///////////////////////////////////////////////////////////////////////////////
         
-    fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .KillProd, .ZmSticky, .AmInv, .PmKilled, .InvA, .Sm, .Se, .Ss);
+    fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .InvA, .Sm, .Se, .Ss);
 
     fmalza #(3*`NF+6) lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
 endmodule
diff --git a/pipelined/src/fpu/fma/fmaadd.sv b/pipelined/src/fpu/fma/fmaadd.sv
index cf0423d3e..16cf6b05e 100644
--- a/pipelined/src/fpu/fma/fmaadd.sv
+++ b/pipelined/src/fpu/fma/fmaadd.sv
@@ -33,8 +33,8 @@
 module fmaadd(
     input logic  [3*`NF+5:0]    Am, // aligned addend's mantissa for addition in U(NF+5.2NF+1)
     input logic  [2*`NF+1:0]    Pm,       // the product's mantissa
-    input logic                 Ps,// the product sign and the alligend addeded's sign (Modified Z sign for other opperations)
-    input logic                 InvA,          // invert the aligned addend
+    input logic                 Ps, As,// the product sign and the aligned addend's sign (Modified Z sign for other operations)
+    input logic                InvA,          // invert the aligned addend
     input logic                 KillProd,      // should the product be set to 0
     input logic                 ZmSticky,
     input logic  [`NE-1:0]      Ze,
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index 10c9bd771..afa645cde 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -91,6 +91,7 @@ module fpu (
    logic 		      XsE, YsE, ZsE;                // input's sign - execute stage
    logic 		      XsM, YsM;                       // input's sign - memory stage
    logic [`NE-1:0] 	XeE, YeE, ZeE;                // input's exponent - execute stage
+   logic [`NE-1:0] 	ZeM;                              // input's exponent - memory stage
    logic [`NF:0] 	   XmE, YmE, ZmE;                // input's fraction - execute stage
    logic [`NF:0] 	   XmM, YmM, ZmM;                // input's fraction - memory stage
    logic 		      XNaNE, YNaNE, ZNaNE;                // is the input a NaN - execute stage
@@ -264,7 +265,7 @@ module fpu (
    fdivsqrt fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
                   .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
                   .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E,
-                  .StallM, .FlushE, .DivSM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .QeM, 
+                  .StallE, .StallM, .FlushE, .DivSM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .QeM, 
                   .QmM, .FPIntDivResultM /*, .DivDone(DivDoneM) */);
 
                   //
@@ -342,6 +343,7 @@ module fpu (
 
    flopenrc #(`NF+1) EMFpReg2 (clk, reset, FlushM, ~StallM, XmE, XmM);
    flopenrc #(`NF+1) EMFpReg3 (clk, reset, FlushM, ~StallM, YmE, YmM);
+   flopenrc #(`FLEN) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZeE,ZmE}, {ZeM,ZmM});
    flopenrc #(`XLEN) EMFpReg6 (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
    flopenrc #(`FLEN) EMFpReg7 (clk, reset, FlushM, ~StallM, PreFpResE, PreFpResM);
    flopenr #(15) EMFpReg5 (clk, reset, ~StallUnpackedM, 
@@ -370,7 +372,7 @@ module fpu (
 
    assign FpLoadStoreM = FResSelM[1];
 
-   postprocess postprocess(.Xs(XsM), .Ys(YsM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), 
+   postprocess postprocess(.Xs(XsM), .Ys(YsM), .Ze(ZeM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), 
                            .FmaZmS(ZmStickyM), .XZero(XZeroM), .YZero(YZeroM), .ZZero(ZZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
                            .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SmM), .DivQe(QeM), /*.DivDone(DivDoneM), */
                            .ZDenorm(ZDenormM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
diff --git a/pipelined/src/fpu/postproc/divshiftcalc.sv b/pipelined/src/fpu/postproc/divshiftcalc.sv
index 392f8db44..58fd9b9b6 100644
--- a/pipelined/src/fpu/postproc/divshiftcalc.sv
+++ b/pipelined/src/fpu/postproc/divshiftcalc.sv
@@ -32,7 +32,8 @@
 
 module divshiftcalc(
     input logic  [`DIVb:0] DivQm,
-    input logic Sqrt, // *** not used right now.  Maybe merge with shift from postprocess
+    input logic  [`FMTBITS-1:0] Fmt,
+    input logic Sqrt,
     input logic [`NE+1:0] DivQe,
     output logic [`LOGNORMSHIFTSZ-1:0] DivShiftAmt,
     output logic [`NORMSHIFTSZ-1:0] DivShiftIn,
diff --git a/pipelined/src/fpu/postproc/fmashiftcalc.sv b/pipelined/src/fpu/postproc/fmashiftcalc.sv
index 1e8012784..ce9ff79b1 100644
--- a/pipelined/src/fpu/postproc/fmashiftcalc.sv
+++ b/pipelined/src/fpu/postproc/fmashiftcalc.sv
@@ -31,6 +31,7 @@
 
 module fmashiftcalc(
     input logic  [3*`NF+5:0]            FmaSm,       // the positive sum
+    input logic  [`NE-1:0]              Ze,      // exponent of Z
     input logic  [$clog2(3*`NF+7)-1:0]  FmaSCnt,   // normalization shift count
     input logic  [`FMTBITS-1:0]         Fmt,       // precision 1 = double 0 = single
     input logic [`NE+1:0] FmaSe,
diff --git a/pipelined/src/fpu/postproc/postprocess.sv b/pipelined/src/fpu/postproc/postprocess.sv
index 368f3ef77..0880d33e2 100644
--- a/pipelined/src/fpu/postproc/postprocess.sv
+++ b/pipelined/src/fpu/postproc/postprocess.sv
@@ -33,6 +33,7 @@
 module postprocess (
     // general signals
     input logic                             Xs, Ys,  // input signs
+    input logic  [`NE-1:0]                  Ze, // exponent of the Z input
     input logic  [`NF:0]                    Xm, Ym, Zm, // input mantissas
     input logic  [2:0]                      Frm,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
     input logic  [`FMTBITS-1:0]             Fmt,       // precision 1 = double 0 = single
@@ -145,9 +146,9 @@ module postprocess (
 
     cvtshiftcalc cvtshiftcalc(.ToInt, .CvtCe, .CvtResDenormUf, .Xm, .CvtLzcIn,  
                               .XZero, .IntToFp, .OutFmt, .CvtResUf, .CvtShiftIn);
-    fmashiftcalc fmashiftcalc(.FmaSm, .FmaSCnt, .Fmt, .NormSumExp, .FmaSe,
+    fmashiftcalc fmashiftcalc(.FmaSm, .Ze, .FmaSCnt, .Fmt, .NormSumExp, .FmaSe,
                           .FmaSZero, .FmaPreResultDenorm, .FmaShiftAmt, .FmaShiftIn);
-    divshiftcalc divshiftcalc(.Sqrt, .DivQe, .DivQm, .DivResDenorm, .DivDenormShiftPos, .DivShiftAmt, .DivShiftIn);
+    divshiftcalc divshiftcalc(.Fmt, .Sqrt, .DivQe, .DivQm, .DivResDenorm, .DivDenormShiftPos, .DivShiftAmt, .DivShiftIn);
 
     always_comb
         case(PostProcSel)
diff --git a/pipelined/src/privileged/csr.sv b/pipelined/src/privileged/csr.sv
index 802b35762..dbb852e84 100644
--- a/pipelined/src/privileged/csr.sv
+++ b/pipelined/src/privileged/csr.sv
@@ -199,7 +199,7 @@ module csr #(parameter
   // CSRs
   ///////////////////////////////////////////
 
-  csri   csri(.clk, .reset, .InstrValidNotFlushedM, 
+  csri   csri(.clk, .reset, .InstrValidNotFlushedM, .StallW, 
               .CSRMWriteM, .CSRSWriteM, .CSRWriteValM, .CSRAdrM, 
               .MExtInt, .SExtInt, .MTimerInt, .MSwInt,
               .MIP_REGW, .MIE_REGW, .MIP_REGW_writeable);
@@ -219,7 +219,7 @@ module csr #(parameter
               .CSRAdrM, .PrivilegeModeW, .CSRWriteValM,
               .MCOUNTINHIBIT_REGW, .MCOUNTEREN_REGW, .SCOUNTEREN_REGW,
               .MTIME_CLINT,  .CSRCReadValM, .IllegalCSRCAccessM);
-  csrm  csrm(.clk, .reset, .InstrValidNotFlushedM,
+  csrm  csrm(.clk, .reset, .InstrValidNotFlushedM, .StallW,
               .CSRMWriteM, .MTrapM, .CSRAdrM,
               .NextEPCM, .NextCauseM, .NextMtvalM, .MSTATUS_REGW, .MSTATUSH_REGW,
               .CSRWriteValM, .CSRMReadValM, .MTVEC_REGW,
@@ -227,7 +227,7 @@ module csr #(parameter
               .MEDELEG_REGW, .MIDELEG_REGW,.PMPCFG_ARRAY_REGW, .PMPADDR_ARRAY_REGW,
               .MIP_REGW, .MIE_REGW, .WriteMSTATUSM, .WriteMSTATUSHM,
               .IllegalCSRMAccessM, .IllegalCSRMWriteReadonlyM);
-  csrs  csrs(.clk, .reset,  .InstrValidNotFlushedM,
+  csrs  csrs(.clk, .reset,  .InstrValidNotFlushedM, .StallW,
               .CSRSWriteM, .STrapM, .CSRAdrM,
               .NextEPCM, .NextCauseM, .NextMtvalM, .SSTATUS_REGW, 
               .STATUS_TVM, .CSRWriteValM, .PrivilegeModeW,
@@ -235,7 +235,7 @@ module csr #(parameter
               .SCOUNTEREN_REGW,
               .SATP_REGW, .MIP_REGW, .MIE_REGW, .MIDELEG_REGW,
               .WriteSSTATUSM, .IllegalCSRSAccessM);
-  csru  csru(.clk, .reset, .InstrValidNotFlushedM,
+  csru  csru(.clk, .reset, .InstrValidNotFlushedM, .StallW,
               .CSRUWriteM, .CSRAdrM, .CSRWriteValM, .STATUS_FS, .CSRUReadValM,  
               .SetFflagsM, .FRM_REGW, .WriteFRMM, .WriteFFLAGSM,
               .IllegalCSRUAccessM);
diff --git a/pipelined/src/privileged/csri.sv b/pipelined/src/privileged/csri.sv
index a145802f0..aa4de62af 100644
--- a/pipelined/src/privileged/csri.sv
+++ b/pipelined/src/privileged/csri.sv
@@ -38,7 +38,7 @@ module csri #(parameter
     SIP = 12'h144
   ) (
     input logic 			clk, reset, 
-    input logic 			InstrValidNotFlushedM, 
+    input logic 			InstrValidNotFlushedM, StallW,
     input logic 			CSRMWriteM, CSRSWriteM,
     input logic [`XLEN-1:0] CSRWriteValM,
     input logic [11:0] 		CSRAdrM,
diff --git a/pipelined/src/privileged/csrm.sv b/pipelined/src/privileged/csrm.sv
index 71368d065..3a8e73ee6 100644
--- a/pipelined/src/privileged/csrm.sv
+++ b/pipelined/src/privileged/csrm.sv
@@ -72,7 +72,7 @@ module csrm #(parameter
     MIDELEG_MASK = 12'h222 // we choose to not make machine interrupts delegable
   ) (
     input logic 	     clk, reset, 
-    input logic 	     InstrValidNotFlushedM, 
+    input logic 	     InstrValidNotFlushedM, StallW,
     input logic 	     CSRMWriteM, MTrapM,
     input logic [11:0] 	     CSRAdrM,
     input logic [`XLEN-1:0]  NextEPCM, NextCauseM, NextMtvalM, MSTATUS_REGW, MSTATUSH_REGW,
diff --git a/pipelined/src/privileged/csrs.sv b/pipelined/src/privileged/csrs.sv
index b43067387..7d3aeeb94 100644
--- a/pipelined/src/privileged/csrs.sv
+++ b/pipelined/src/privileged/csrs.sv
@@ -50,7 +50,7 @@ module csrs #(parameter
 
   ) (
     input logic 	     clk, reset, 
-    input logic 	     InstrValidNotFlushedM, 
+    input logic 	     InstrValidNotFlushedM, StallW,
     input logic 	     CSRSWriteM, STrapM,
     input logic [11:0] 	     CSRAdrM,
     input logic [`XLEN-1:0]  NextEPCM, NextCauseM, NextMtvalM, SSTATUS_REGW, 
diff --git a/pipelined/src/privileged/csrsr.sv b/pipelined/src/privileged/csrsr.sv
index f6fa38183..c4f841959 100644
--- a/pipelined/src/privileged/csrsr.sv
+++ b/pipelined/src/privileged/csrsr.sv
@@ -32,8 +32,7 @@
 `include "wally-config.vh"
 
 module csrsr (
-  input  logic             clk, reset, 
-  input  logic             StallW,
+  input  logic             clk, reset, StallW,
   input  logic             WriteMSTATUSM, WriteMSTATUSHM, WriteSSTATUSM, 
   input  logic             TrapM, FRegWriteM,
   input  logic [1:0]       NextPrivilegeModeM, PrivilegeModeW,
diff --git a/pipelined/src/privileged/csru.sv b/pipelined/src/privileged/csru.sv
index c1eea42c3..7d1c5cbe5 100644
--- a/pipelined/src/privileged/csru.sv
+++ b/pipelined/src/privileged/csru.sv
@@ -37,7 +37,7 @@ module csru #(parameter
   FRM = 12'h002,
   FCSR = 12'h003) (
     input  logic             clk, reset, 
-    input  logic             InstrValidNotFlushedM,
+    input  logic             InstrValidNotFlushedM, StallW,
     input  logic             CSRUWriteM,
     input  logic [11:0]      CSRAdrM,
     input  logic [`XLEN-1:0] CSRWriteValM,
diff --git a/pipelined/src/privileged/trap.sv b/pipelined/src/privileged/trap.sv
index 25936932b..ec3cc8634 100644
--- a/pipelined/src/privileged/trap.sv
+++ b/pipelined/src/privileged/trap.sv
@@ -63,12 +63,12 @@ module trap (
   ///////////////////////////////////////////
   assign MIntGlobalEnM = (PrivilegeModeW != `M_MODE) | STATUS_MIE; // if M ints enabled or lower priv 3.1.9
   assign SIntGlobalEnM = (PrivilegeModeW == `U_MODE) | ((PrivilegeModeW == `S_MODE) & STATUS_SIE); // if in lower priv mode, or if S ints enabled and not in higher priv mode 3.1.9
-  assign Committed = CommittedM | CommittedF;
-  assign EnabledIntsM = {12{~Committed & InstrValidM}} & ({12{MIntGlobalEnM}} & ~MIDELEG_REGW | {12{SIntGlobalEnM}} & MIDELEG_REGW);
   assign PendingIntsM = MIP_REGW & MIE_REGW;
   assign IntPendingM = |PendingIntsM;
-  assign ValidIntsM = PendingIntsM & EnabledIntsM;
-  assign InterruptM = (|ValidIntsM) ; // suppress interrupt if the memory system has partially processed a request.
+  assign Committed = CommittedM | CommittedF;
+  assign EnabledIntsM = ({12{MIntGlobalEnM}} & PendingIntsM & ~MIDELEG_REGW | {12{SIntGlobalEnM}} & PendingIntsM & MIDELEG_REGW);
+  assign ValidIntsM = {12{~Committed}} & EnabledIntsM;
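+  assign InterruptM = (|ValidIntsM) & InstrValidM; // take an interrupt only on a valid instruction; ValidIntsM is already masked by ~Committed, which suppresses the interrupt if the memory system has partially processed a request.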
   assign DelegateM = `S_SUPPORTED & (InterruptM ? MIDELEG_REGW[CauseM[3:0]] : MEDELEG_REGW[CauseM]) & 
                      (PrivilegeModeW == `U_MODE | PrivilegeModeW == `S_MODE);