From b3e1badd3184b3e1a3c374f2fc2541054d80486a Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Thu, 12 Jan 2023 07:15:14 -0800
Subject: [PATCH] MDU comment cleanup

---
 pipelined/src/mdu/intdivrestoring.sv     | 55 +++++++------
 pipelined/src/mdu/intdivrestoringstep.sv | 17 ++--
 pipelined/src/mdu/mdu.sv                 | 61 +++++++--------
 pipelined/src/mdu/mul.sv                 | 99 ++++++++++++------------
 4 files changed, 121 insertions(+), 111 deletions(-)

diff --git a/pipelined/src/mdu/intdivrestoring.sv b/pipelined/src/mdu/intdivrestoring.sv
index dab3c2d97..0d95233ca 100644
--- a/pipelined/src/mdu/intdivrestoring.sv
+++ b/pipelined/src/mdu/intdivrestoring.sv
@@ -6,6 +6,8 @@
 //
 // Purpose: Restoring integer division using a shift register and subtractor
 // 
+// Documentation: RISC-V System on Chip Design Chapter 12 (Figure 12.19)
+//
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
@@ -26,32 +28,39 @@
 
 `include "wally-config.vh"
 
-  /* verilator lint_off UNOPTFLAT */
-
-module intdivrestoring (
-  input  logic clk,
-  input  logic reset,
-  input  logic StallM,
-  input  logic FlushE,
-  input  logic DivSignedE, W64E,
-  input  logic IntDivE,
-  //input logic [`XLEN-1:0] 	SrcAE, SrcBE,
-	input logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
-  output logic DivBusyE, 
-  output logic [`XLEN-1:0] QuotM, RemM
+module intdivrestoring(
+  input  logic             clk,
+  input  logic             reset,
+  input  logic             StallM,
+  input  logic             FlushE,
+  input  logic             IntDivE,                       // integer division/remainder instruction of any type
+  input  logic             DivSignedE,                    // signed division 
+  input  logic             W64E,                          // W-type instructions (divw, divuw, remw, remuw)
+	input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // Forwarding mux outputs for Source A and B
+  output logic             DivBusyE,                      // Divide is busy - stall pipeline
+  output logic [`XLEN-1:0] QuotM, RemM                    // Quotient and remainder outputs
  );
 
-  typedef enum logic [1:0] {IDLE, BUSY, DONE} statetype;
+  localparam STEPBITS = $clog2(`XLEN/`IDIV_BITSPERCYCLE); // Number of steps
+
+  typedef enum logic [1:0] {IDLE, BUSY, DONE} statetype;  // division FSM state
   statetype state;
 
-  logic [`XLEN-1:0] W[`IDIV_BITSPERCYCLE:0];
-  logic [`XLEN-1:0] XQ[`IDIV_BITSPERCYCLE:0];
-  logic [`XLEN-1:0] DinE, XinE, DnE, DAbsBE, DAbsB, XnE, XInitE, WnM, XQnM;
-  localparam STEPBITS = $clog2(`XLEN/`IDIV_BITSPERCYCLE);
-  logic [STEPBITS:0] step;
-  logic Div0E, Div0M;
-  logic DivStartE, SignXE, SignDE, NegQE, NegWM, NegQM;
-  logic [`XLEN-1:0] WNext, XQNext;
+  logic [`XLEN-1:0]   W[`IDIV_BITSPERCYCLE:0];            // Residual for each of k steps
+  logic [`XLEN-1:0]   XQ[`IDIV_BITSPERCYCLE:0];           // dividend/quotient for each of k steps
+  logic [`XLEN-1:0]   WNext, XQNext;                      // initialized W and XQ going into registers
+  logic [`XLEN-1:0]   DinE, XinE;                         // divisor & dividend, possibly truncated to 32 bits
+  logic [`XLEN-1:0]   DnE;                                // DnE = ~DinE
+  logic [`XLEN-1:0]   DAbsBE;                             // absolute value of D
+  logic [`XLEN-1:0]   DAbsB;                              // registered absolute value of D, constant during division
+  logic [`XLEN-1:0]   XnE;                                // XnE = ~XinE
+  logic [`XLEN-1:0]   XInitE;                             // |X|, or original X for divide by 0
+  logic [`XLEN-1:0]   WnM, XQnM;                          // negated residual W and quotient XQ for postprocessing sign correction
+  logic [STEPBITS:0]  step;                               // division step
+  logic               Div0E, Div0M;                       // divide by 0
+  logic               DivStartE;                          // start integer division
+  logic               SignXE, SignDE;                     // sign of dividend and divisor
+  logic               NegQE, NegWM, NegQM;                // negate quotient or residual during postprocessing
  
   //////////////////////////////
   // Execute Stage: prepare for division calculation with control logic, W logic and absolute values, initialize W and XQ
@@ -134,5 +143,3 @@ module intdivrestoring (
       else        state <= IDLE;
     end 
 endmodule 
-
-/* verilator lint_on UNOPTFLAT */
diff --git a/pipelined/src/mdu/intdivrestoringstep.sv b/pipelined/src/mdu/intdivrestoringstep.sv
index 95a26e82f..cc27a7d5b 100644
--- a/pipelined/src/mdu/intdivrestoringstep.sv
+++ b/pipelined/src/mdu/intdivrestoringstep.sv
@@ -4,8 +4,10 @@
 // Written: David_Harris@hmc.edu 2 October 2021
 // Modified: 
 //
-// Purpose: Restoring integer division using a shift register and subtractor
+// Purpose: Restoring integer division step.  k steps are used in intdivrestoring
 // 
+// Documentation: RISC-V System on Chip Design Chapter 12 (Figure 12.19)
+//
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
@@ -29,11 +31,16 @@
 /* verilator lint_off UNOPTFLAT */
 
 module intdivrestoringstep(
-  input  logic [`XLEN-1:0] W, XQ, DAbsB,
-  output logic [`XLEN-1:0] WOut, XQOut);
+  input  logic [`XLEN-1:0] W,     // Residual in
+  input  logic [`XLEN-1:0] XQ,    // bits of dividend X and quotient Q in
+  input  logic [`XLEN-1:0] DAbsB, // complement of absolute value of divisor D (for subtraction)
+  output logic [`XLEN-1:0] WOut,  // Residual out
+  output logic [`XLEN-1:0] XQOut  // bits of dividend and quotient out: discard one bit of X, append one bit of Q
+);
 
-  logic [`XLEN-1:0] WShift, WPrime;
-  logic qi, qib;
+  logic [`XLEN-1:0] WShift;       // Shift W left by one bit, bringing in most significant bit of X
+  logic [`XLEN-1:0] WPrime;       // WShift - D, for comparison and possible result
+  logic qi, qib;                  // Quotient digit and its complement
   
   assign {WShift, XQOut} = {W[`XLEN-2:0], XQ, qi};  // shift W and X/Q left, insert quotient bit at bottom
   adder #(`XLEN+1) wdsub({1'b0, WShift}, {1'b1, DAbsB}, {qib, WPrime}); // effective subtractor, carry out determines quotient bit
diff --git a/pipelined/src/mdu/mdu.sv b/pipelined/src/mdu/mdu.sv
index bb242b75f..4a85bf478 100644
--- a/pipelined/src/mdu/mdu.sv
+++ b/pipelined/src/mdu/mdu.sv
@@ -6,6 +6,8 @@
 //
 // Purpose: M extension multiply and divide
 // 
+// Documentation: RISC-V System on Chip Design Chapter 12 (Figure 12.21)
+//
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
@@ -26,56 +28,49 @@
 
 `include "wally-config.vh"
 
-module mdu (
-	       input logic 		clk, reset,
-	       // Execute Stage interface
-	       //    input logic [`XLEN-1:0] 	SrcAE, SrcBE,
-		   input logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
-	       input logic [2:0] 	Funct3E, Funct3M,
-	       input logic 		IntDivE, W64E, 
-	       // Writeback stage
-	       output logic [`XLEN-1:0] MDUResultW,
-	       // Divide Done
-	       output logic 		DivBusyE, 
-	       // hazards
-	       input logic 		StallM, StallW, FlushE, FlushM, FlushW 
-	       );
+module mdu(
+  input  logic 							clk, reset,
+  input  logic 							StallM, StallW, 
+  input  logic							FlushE, FlushM, FlushW,
+	input  logic [`XLEN-1:0] 	ForwardedSrcAE, ForwardedSrcBE, 	// inputs A and B from IEU forwarding mux output
+	input  logic [2:0] 				Funct3E, Funct3M,									// type of MDU operation
+	input  logic 							IntDivE, W64E, 										// Integer division/remainder, and W-type instructions
+	output logic [`XLEN-1:0] 	MDUResultW,												// multiply/divide result
+	output logic 							DivBusyE													// busy signal to stall pipeline in Execute stage
+);
 
-	logic [`XLEN-1:0] MDUResultM;
-	logic [`XLEN-1:0] PrelimResultM;
-	logic [`XLEN-1:0] QuotM, RemM;
-	logic [`XLEN*2-1:0] ProdM; 
-
-	logic 		     DivSignedE;	
-	logic           W64M; 
+	logic [`XLEN*2-1:0] 			ProdM; 														// double-width product from mul
+	logic [`XLEN-1:0] 				QuotM, RemM;											// quotient and remainder from intdivrestoring
+	logic [`XLEN-1:0] 				PrelimResultM;										// selected result before W truncation
+	logic [`XLEN-1:0] 				MDUResultM;												// result after W truncation
+	logic           					W64M; 														// W-type instruction
 
 	// Multiplier
 	mul mul(.clk, .reset, .StallM, .FlushM, .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .ProdM);
 
-	// Divide
+	// Divider
 	// Start a divide when a new division instruction is received and the divider isn't already busy or finishing
-	// When F extensions are supported, use the FPU divider instead
+	// When IDIV_ON_FPU is set, use the FPU divider instead
 	if (`IDIV_ON_FPU) begin  
 	  assign QuotM = 0;
 	  assign RemM = 0;
 	  assign DivBusyE = 0;
 	end else begin
-		assign DivSignedE = ~Funct3E[0];
-		intdivrestoring div(.clk, .reset, .StallM, .FlushE, .DivSignedE, .W64E, .IntDivE, 
+		intdivrestoring div(.clk, .reset, .StallM, .FlushE, .DivSignedE(~Funct3E[0]), .W64E, .IntDivE, 
 							.ForwardedSrcAE, .ForwardedSrcBE, .DivBusyE, .QuotM, .RemM);
 	end
 		
 	// Result multiplexer
 	always_comb
 		case (Funct3M)	   
-			3'b000: PrelimResultM = ProdM[`XLEN-1:0];
-			3'b001: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
-			3'b010: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
-			3'b011: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];
-			3'b100: PrelimResultM = QuotM;
-			3'b101: PrelimResultM = QuotM;
-			3'b110: PrelimResultM = RemM;
-			3'b111: PrelimResultM = RemM;
+			3'b000: PrelimResultM = ProdM[`XLEN-1:0];					// mul
+			3'b001: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];		// mulh
+			3'b010: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];		// mulhsu
+			3'b011: PrelimResultM = ProdM[`XLEN*2-1:`XLEN];		// mulhu
+			3'b100: PrelimResultM = QuotM;										// div
+			3'b101: PrelimResultM = QuotM;										// divu
+			3'b110: PrelimResultM = RemM;											// rem
+			3'b111: PrelimResultM = RemM;											// remu
 		endcase 
 
 	// Handle sign extension for W-type instructions
diff --git a/pipelined/src/mdu/mul.sv b/pipelined/src/mdu/mul.sv
index b94ce7993..952b4daf2 100644
--- a/pipelined/src/mdu/mul.sv
+++ b/pipelined/src/mdu/mul.sv
@@ -4,8 +4,10 @@
 // Written: David_Harris@hmc.edu 16 February 2021
 // Modified: 
 //
-// Purpose: Multiply instructions
+// Purpose: Integer multiplication
 // 
+// Documentation: RISC-V System on Chip Design Chapter 12 (Figure 12.18)
+//
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
@@ -26,69 +28,68 @@
 
 `include "wally-config.vh"
 
-module mul (
-  // Execute Stage interface
-  input  logic             clk, reset,
-  input  logic             StallM, FlushM,
-    //    input logic [`XLEN-1:0] 	SrcAE, SrcBE,
-  input logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
-  input  logic [2:0]       Funct3E,
-  output logic [`XLEN*2-1:0] ProdM
+module mul(
+  input  logic                clk, reset,
+  input  logic                StallM, FlushM,
+  input  logic [`XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // source A and B from after Forwarding mux
+  input  logic [2:0]          Funct3E,                        // type of multiply
+  output logic [`XLEN*2-1:0]  ProdM                           // double-width product
 );
 
-    // Number systems
-    // Let A' = sum(i=0, XLEN-2, A[i]*2^i)
-    // Unsigned: A = A' + A[XLEN-1]*2^(XLEN-1)
-    // Signed:   A = A' - A[XLEN-1]*2^(XLEN-1)
+  // Number systems
+  // Let A' = sum(i=0, XLEN-2, A[i]*2^i)
+  // Unsigned: A = A' + A[XLEN-1]*2^(XLEN-1)
+  // Signed:   A = A' - A[XLEN-1]*2^(XLEN-1)
 
-    // Multiplication: A*B
-    // Let P' = A' * B'
-    //     PA = (A' * B[XLEN-1]) 
-    //     PB = (B' * A[XLEN-1])
-    //     PP = A[XLEN-1] * B[XLEN-1]
-    // Signed * Signed     = P' + (-PA - PB)*2^(XLEN-1) + PP*2^(2XLEN-2)
-    // Signed * Unsigned   = P' + ( PA - PB)*2^(XLEN-1) - PP*2^(2XLEN-2)
-    // Unsigned * Unsigned = P' + ( PA + PB)*2^(XLEN-1) + PP*2^(2XLEN-2)
-
-    logic [`XLEN*2-1:0] PP1E, PP2E, PP3E, PP4E;
-    logic [`XLEN*2-1:0] PP1M, PP2M, PP3M, PP4M;
-    logic [`XLEN-2:0]   PA, PB;
-    logic               PP;
-    logic               MULH, MULHSU;
-    logic [`XLEN-1:0]   Aprime, Bprime;
+  // Multiplication: A*B
+  // Let P' = A' * B'
+  //     PA = (A' * B[XLEN-1]) 
+  //     PB = (B' * A[XLEN-1])
+  //     PP = A[XLEN-1] * B[XLEN-1]
+  // Signed * Signed     = P' + (-PA - PB)*2^(XLEN-1) + PP*2^(2XLEN-2)
+  // Signed * Unsigned   = P' + ( PA - PB)*2^(XLEN-1) - PP*2^(2XLEN-2)
+  // Unsigned * Unsigned = P' + ( PA + PB)*2^(XLEN-1) + PP*2^(2XLEN-2)
 
+  logic [`XLEN-1:0]   Aprime, Bprime;                       // lower bits of source A and B
+  logic               MULH, MULHSU;                         // type of multiply
+  logic [`XLEN-2:0]   PA, PB;                               // product of msb and lsbs
+  logic               PP;                                   // product of msbs
+  logic [`XLEN*2-1:0] PP1E, PP2E, PP3E, PP4E;               // partial products
+  logic [`XLEN*2-1:0] PP1M, PP2M, PP3M, PP4M;               // registered partial products
+ 
   //////////////////////////////
   // Execute Stage: Compute partial products
   //////////////////////////////
 
-    assign Aprime = {1'b0, ForwardedSrcAE[`XLEN-2:0]};
-    assign Bprime = {1'b0, ForwardedSrcBE[`XLEN-2:0]};
-    assign PP1E = Aprime * Bprime;
-    assign PA = {(`XLEN-1){ForwardedSrcAE[`XLEN-1]}} & ForwardedSrcBE[`XLEN-2:0];  
-    assign PB = {(`XLEN-1){ForwardedSrcBE[`XLEN-1]}} & ForwardedSrcAE[`XLEN-2:0];
-    assign PP = ForwardedSrcAE[`XLEN-1] & ForwardedSrcBE[`XLEN-1];
+  assign Aprime = {1'b0, ForwardedSrcAE[`XLEN-2:0]};
+  assign Bprime = {1'b0, ForwardedSrcBE[`XLEN-2:0]};
+  assign PP1E = Aprime * Bprime;
+  assign PA = {(`XLEN-1){ForwardedSrcAE[`XLEN-1]}} & ForwardedSrcBE[`XLEN-2:0];  
+  assign PB = {(`XLEN-1){ForwardedSrcBE[`XLEN-1]}} & ForwardedSrcAE[`XLEN-2:0];
+  assign PP = ForwardedSrcAE[`XLEN-1] & ForwardedSrcBE[`XLEN-1];
 
-    // flavor of multiplication
-    assign MULH   = (Funct3E == 3'b001);
-    assign MULHSU = (Funct3E == 3'b010);
+  // flavor of multiplication
+  assign MULH   = (Funct3E == 3'b001);
+  assign MULHSU = (Funct3E == 3'b010);
 
-    // Handle signs
-    assign PP2E = {2'b00, (MULH | MULHSU) ? ~PA : PA, {(`XLEN-1){1'b0}}};
-    assign PP3E = {2'b00, (MULH) ? ~PB : PB, {(`XLEN-1){1'b0}}};
-    always_comb 
-    if (MULH)        PP4E = {1'b1, PP, {(`XLEN-3){1'b0}}, 1'b1, {(`XLEN){1'b0}}}; 
-    else if (MULHSU) PP4E = {1'b1, ~PP, {(`XLEN-2){1'b0}}, 1'b1, {(`XLEN-1){1'b0}}};
-    else             PP4E = {1'b0, PP, {(`XLEN*2-2){1'b0}}};
+  // Select partial products, handling signed multiplication
+  assign PP2E = {2'b00, (MULH | MULHSU) ? ~PA : PA, {(`XLEN-1){1'b0}}};
+  assign PP3E = {2'b00, (MULH) ? ~PB : PB, {(`XLEN-1){1'b0}}};
+  always_comb 
+  if (MULH)        PP4E = {1'b1, PP, {(`XLEN-3){1'b0}}, 1'b1, {(`XLEN){1'b0}}}; 
+  else if (MULHSU) PP4E = {1'b1, ~PP, {(`XLEN-2){1'b0}}, 1'b1, {(`XLEN-1){1'b0}}};
+  else             PP4E = {1'b0, PP, {(`XLEN*2-2){1'b0}}};
 
   //////////////////////////////
   // Memory Stage: Sum partial proudcts
   //////////////////////////////
 
-	 flopenrc #(`XLEN*2) PP1Reg(clk, reset, FlushM, ~StallM, PP1E, PP1M); 
-	 flopenrc #(`XLEN*2) PP2Reg(clk, reset, FlushM, ~StallM, PP2E, PP2M); 
-	 flopenrc #(`XLEN*2) PP3Reg(clk, reset, FlushM, ~StallM, PP3E, PP3M); 
-	 flopenrc #(`XLEN*2) PP4Reg(clk, reset, FlushM, ~StallM, PP4E, PP4M); 
+  flopenrc #(`XLEN*2) PP1Reg(clk, reset, FlushM, ~StallM, PP1E, PP1M); 
+  flopenrc #(`XLEN*2) PP2Reg(clk, reset, FlushM, ~StallM, PP2E, PP2M); 
+  flopenrc #(`XLEN*2) PP3Reg(clk, reset, FlushM, ~StallM, PP3E, PP3M); 
+  flopenrc #(`XLEN*2) PP4Reg(clk, reset, FlushM, ~StallM, PP4E, PP4M); 
 
-    assign ProdM = PP1M + PP2M + PP3M + PP4M; //ForwardedSrcAE * ForwardedSrcBE;
+  // add up partial products; this multi-input add implies CSAs and a final CPA
+  assign ProdM = PP1M + PP2M + PP3M + PP4M; //ForwardedSrcAE * ForwardedSrcBE;
  endmodule