From fec40a1b75f8a929760baa22ee8becabbb84850a Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 25 May 2021 14:26:22 -0500 Subject: [PATCH 01/19] fixed bug with icache miss spill fsm branch. --- .../src/ifu/globalHistoryPredictor.sv | 78 +++++-------------- wally-pipelined/src/ifu/icache.sv | 22 ++++-- 2 files changed, 34 insertions(+), 66 deletions(-) diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv index 087458df..b2357ecc 100644 --- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv +++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv @@ -32,76 +32,34 @@ module globalHistoryPredictor ) (input logic clk, input logic reset, - input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, + input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, input logic [`XLEN-1:0] LookUpPC, output logic [1:0] Prediction, // update input logic [`XLEN-1:0] UpdatePC, input logic UpdateEN, PCSrcE, input logic [1:0] UpdatePrediction - + ); - logic [k-1:0] GHRF, GHRFNext; - assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; + logic [k-1:0] GHRF, GHRFNext; + assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; - flopenr #(k) GlobalHistoryRegister(.clk(clk), - .reset(reset), - .en(UpdateEN), - .d(GHRFNext), - .q(GHRF)); - - - - logic [1:0] PredictionMemory; - logic DoForwarding, DoForwardingF; - logic [1:0] UpdatePredictionF; - + flopenr #(k) GlobalHistoryRegister(.clk(clk), + .reset(reset), + .en(UpdateEN), + .d(GHRFNext), + .q(GHRF)); // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT - // GHR referes to the address that the past k branches points to in the prediction stage - // GHRE refers to the address that the past k branches points to in the exectution stage - SRAM2P1R1W #(k, 2) PHT(.clk(clk), - .reset(reset), - .RA1(GHRF), - .RD1(PredictionMemory), - .REN1(~StallF), - .WA1(GHRFNext), - .WD1(UpdatePrediction), - .WEN1(UpdateEN), - .BitWEN1(2'b11)); + SRAM2P1R1W #(k, 2) 
PHT(.clk(clk), + .reset(reset), + .RA1(GHRF), + .RD1(Prediction), + .REN1(~StallF), + .WA1(GHRF), + .WD1(UpdatePrediction), + .WEN1(UpdateEN), + .BitWEN1(2'b11)); - // need to forward when updating to the same address as reading. - // first we compare to see if the update and lookup addreses are the same - assign DoForwarding = GHRF == GHRFNext; - - // register the update value and the forwarding signal into the Fetch stage - // TODO: add stall logic *** - flopr #(1) DoForwardingReg(.clk(clk), - .reset(reset), - .d(DoForwarding), - .q(DoForwardingF)); - - flopr #(2) UpdatePredictionReg(.clk(clk), - .reset(reset), - .d(UpdatePrediction), - .q(UpdatePredictionF)); - - assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory; - - //pipeline for GHR - /*flopenrc #(k) GHRDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .clear(FlushD), - .d(GHRF), - .q(GHRD)); - - flopenrc #(k) GHREReg(.clk(clk), - .reset(reset), - .en(~StallE), - .clear(FlushE), - .d(GHRD), - .q(GHRE)); -*/ endmodule diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 9e30a083..4f51edd7 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -154,15 +154,16 @@ module icachecontroller #(parameter LINESIZE = 256) ( localparam STATE_MISS_SPILL_FETCH_DONE = 10; // write data into SRAM/LUT localparam STATE_MISS_SPILL_READ1 = 11; // read block 0 from SRAM/LUT localparam STATE_MISS_SPILL_2 = 12; // return to ready if hit or do second block update. - localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 13; // miss on block 1, issue read to AHB and wait - localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 14; // write data to SRAM/LUT - localparam STATE_MISS_SPILL_MERGE = 15; // read block 0 of CPU access, + localparam STATE_MISS_SPILL_2_START = 13; // return to ready if hit or do second block update. 
+ localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 14; // miss on block 1, issue read to AHB and wait + localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 15; // write data to SRAM/LUT + localparam STATE_MISS_SPILL_MERGE = 16; // read block 0 of CPU access, - localparam STATE_MISS_SPILL_FINAL = 16; // this state replicates STATE_READY's replay of the + localparam STATE_MISS_SPILL_FINAL = 17; // this state replicates STATE_READY's replay of the // spill access but does nto consider spill. It also does not do another operation. - localparam STATE_INVALIDATE = 17; // *** not sure if invalidate or evict? invalidate by cache block or address? + localparam STATE_INVALIDATE = 18; // *** not sure if invalidate or evict? invalidate by cache block or address? localparam AHBByteLength = `XLEN / 8; localparam AHBOFFETWIDTH = $clog2(AHBByteLength); @@ -380,11 +381,20 @@ module icachecontroller #(parameter LINESIZE = 256) ( PCMux = 2'b10; UnalignedSelect = 1'b1; spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm. + ICacheReadEn = 1'b1; + NextState = STATE_MISS_SPILL_2_START; + end + STATE_MISS_SPILL_2_START: begin if (~hit) begin CntReset = 1'b1; NextState = STATE_MISS_SPILL_MISS_FETCH_WDV; end else begin - NextState = STATE_MISS_SPILL_FINAL; + NextState = STATE_READY; + ICacheReadEn = 1'b1; + PCMux = 2'b00; + UnalignedSelect = 1'b1; + SavePC = 1'b1; + ICacheStallF = 1'b0; end end STATE_MISS_SPILL_MISS_FETCH_WDV: begin From 7e84c3f51481d788f0714ce6f128a4eff881290b Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Thu, 27 May 2021 11:48:29 -0500 Subject: [PATCH 02/19] Updated benchmarking code. 
--- testsBP/crt0/Makefile | 4 ++-- testsBP/crt0/start.s | 7 +++---- testsBP/mibench_qsort/Makefile | 2 +- testsBP/sieve/Makefile | 2 +- testsBP/sieve/sieve.c | 20 ++++++++++---------- testsBP/simple/Makefile | 2 +- testsBP/simple/header.h | 1 + testsBP/simple/main.c | 1 + 8 files changed, 20 insertions(+), 19 deletions(-) diff --git a/testsBP/crt0/Makefile b/testsBP/crt0/Makefile index ab47384f..b42e86cb 100644 --- a/testsBP/crt0/Makefile +++ b/testsBP/crt0/Makefile @@ -9,7 +9,7 @@ MABI :=-mabi=lp64 LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles AFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -W -CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -mcmodel=medany +CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -mcmodel=medany -O2 AS=riscv64-unknown-elf-as CC=riscv64-unknown-elf-gcc AR=riscv64-unknown-elf-ar @@ -19,7 +19,7 @@ all: libcrt0.a %.o: %.s ${AS} ${AFLAGS} -c $< -o $@ -libcrt0.a: start.o +libcrt0.a: start.o pcnt_driver.o pre_main.o ${AR} -r $@ $^ clean: diff --git a/testsBP/crt0/start.s b/testsBP/crt0/start.s index 19a240d8..731a61e3 100644 --- a/testsBP/crt0/start.s +++ b/testsBP/crt0/start.s @@ -43,11 +43,10 @@ _start: - # set the stack pointer to the top of memory - # 0x8000_0000 + 64K - 8 bytes - li sp, 0x007FFFF8 + # set the stack pointer to the top of memory - 8 bytes (pointer size) + li sp, 0x07FFFFF8 - jal ra, main + jal ra, pre_main jal ra, _halt .section .text diff --git a/testsBP/mibench_qsort/Makefile b/testsBP/mibench_qsort/Makefile index f4d36839..b1cf7b67 100644 --- a/testsBP/mibench_qsort/Makefile +++ b/testsBP/mibench_qsort/Makefile @@ -8,7 +8,7 @@ MARCH :=-march=rv64ic MABI :=-mabi=lp64 LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map -CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align +CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align -O2 CC=riscv64-unknown-elf-gcc DA=riscv64-unknown-elf-objdump -d diff --git a/testsBP/sieve/Makefile b/testsBP/sieve/Makefile index 
1d38d123..9c884f48 100644 --- a/testsBP/sieve/Makefile +++ b/testsBP/sieve/Makefile @@ -8,7 +8,7 @@ MARCH :=-march=rv64ic MABI :=-mabi=lp64 LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map -CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align +CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align -O2 CC=riscv64-unknown-elf-gcc DA=riscv64-unknown-elf-objdump -d diff --git a/testsBP/sieve/sieve.c b/testsBP/sieve/sieve.c index e8207404..f7d36d95 100644 --- a/testsBP/sieve/sieve.c +++ b/testsBP/sieve/sieve.c @@ -66,21 +66,21 @@ int main () { ans = sieve (); //gettimeofday(&after , NULL); - if (ans != 1899) - printf ("Sieve result wrong, ans = %d, expected 1899", ans); + /* /\* /\\* if (ans != 1899) *\\/ *\/ */ + /* /\* /\\* printf ("Sieve result wrong, ans = %d, expected 1899", ans); *\\/ *\/ */ - //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); + /* /\* //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); *\/ */ - printf("Round 2\n"); - //gettimeofday(&before , NULL); + /* /\* printf("Round 2\n"); *\/ */ + /* //gettimeofday(&before , NULL); */ - ans = sieve (); - //gettimeofday(&after , NULL); - if (ans != 1899) - printf ("Sieve result wrong, ans = %d, expected 1899", ans); + /* ans = sieve (); */ + /* //gettimeofday(&after , NULL); */ + /* if (ans != 1899) */ + /* printf ("Sieve result wrong, ans = %d, expected 1899", ans); */ - //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); + /* //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); */ return 0; diff --git a/testsBP/simple/Makefile b/testsBP/simple/Makefile index 450aacaa..4447f284 100644 --- a/testsBP/simple/Makefile +++ b/testsBP/simple/Makefile @@ -8,7 +8,7 @@ MARCH :=-march=rv64ic MABI :=-mabi=lp64 LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map -CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align +CFLAGS =$(MARCH) $(MABI) 
-Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align -O2 CC=riscv64-unknown-elf-gcc DA=riscv64-unknown-elf-objdump -d diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h index bfe014a4..6def656f 100644 --- a/testsBP/simple/header.h +++ b/testsBP/simple/header.h @@ -5,4 +5,5 @@ int fail(); int simple_csrbr_test(); int lbu_test(); int icache_spill_test(); +void global_hist_test(); #endif diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c index 0d14fcfb..036a351d 100644 --- a/testsBP/simple/main.c +++ b/testsBP/simple/main.c @@ -2,6 +2,7 @@ int main(){ //int res = icache_spill_test(); + global_hist_test(); int res = 1; if (res < 0) { fail(); From 8a035104ac47678fc1de4fc1110511c5334233ae Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Thu, 27 May 2021 23:06:28 -0500 Subject: [PATCH 03/19] It's a bit sloppy, but the global history predictor is working correctly now. There were two major bugs with the predictor. First, the update mechanism was completely wrong. The PHT is updated with the GHR that was used to look up the prediction. PHT[GHR] = Sat2(PHT[GHR], branch outcome). Second, the GHR needs to be updated speculatively as the branch is predicted. This is important so that back-to-back branches' GHRs are not the same. They must be different to avoid aliasing. Speculation of the GHR update allows them to be different. On misprediction the GHR must be reverted. This implementation is a bit sloppy with names and how the GHR recovery is performed. Updates to follow.
--- wally-pipelined/config/rv64BP/wally-config.vh | 3 +- wally-pipelined/src/ifu/bpred.sv | 9 ++-- .../src/ifu/globalHistoryPredictor.sv | 47 +++++++++++++++++-- wally-pipelined/src/ifu/ifu.sv | 9 +--- 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh index 17a8c284..fd482bfd 100644 --- a/wally-pipelined/config/rv64BP/wally-config.vh +++ b/wally-pipelined/config/rv64BP/wally-config.vh @@ -110,5 +110,6 @@ `define TWO_BIT_PRELOAD "../config/rv64icfd/twoBitPredictor.txt" `define BTB_PRELOAD "../config/rv64icfd/BTBPredictor.txt" `define BPRED_ENABLED 1 -`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE +//`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE +`define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE" or BPLOCALPAg or BPGSHARE `define TESTSBP 1 diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv index de0f8143..c5b4dde4 100644 --- a/wally-pipelined/src/ifu/bpred.sv +++ b/wally-pipelined/src/ifu/bpred.sv @@ -30,7 +30,8 @@ module bpred (input logic clk, reset, - input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, + input logic StallF, StallD, StallE, StallM, StallW, + input logic FlushF, FlushD, FlushE, FlushM, FlushW, // Fetch stage // the prediction input logic [`XLEN-1:0] PCNextF, // *** forgot to include this one on the I/O list @@ -93,6 +94,8 @@ module bpred // update .UpdatePC(PCE), .UpdateEN(InstrClassE[0] & ~StallE), + .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF), + .BPPredDirWrongE(BPPredDirWrongE), .PCSrcE(PCSrcE), .UpdatePrediction(UpdateBPPredE)); end else if (`BPTYPE == "BPGSHARE") begin:Predictor @@ -190,14 +193,14 @@ module bpred flopenrc #(2) BPPredRegD(.clk(clk), .reset(reset), .en(~StallD), - .clear(FlushD), + .clear(1'b0), .d(BPPredF), .q(BPPredD)); flopenrc #(2) BPPredRegE(.clk(clk), .reset(reset), .en(~StallE), - .clear(FlushE), + .clear(1'b0), .d(BPPredD), .q(BPPredE)); diff 
--git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv index b2357ecc..fadbf004 100644 --- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv +++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv @@ -37,29 +37,66 @@ module globalHistoryPredictor output logic [1:0] Prediction, // update input logic [`XLEN-1:0] UpdatePC, - input logic UpdateEN, PCSrcE, + input logic UpdateEN, PCSrcE, + input logic SpeculativeUpdateEn, BPPredDirWrongE, input logic [1:0] UpdatePrediction ); - logic [k-1:0] GHRF, GHRFNext; - assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; + logic [k-1:0] GHRF, GHRFNext, GHRD, GHRE, GHRLookup; + + logic FlushedD, FlushedE; + + + // if the prediction is wrong we need to restore the ghr. + assign GHRFNext = BPPredDirWrongE ? {PCSrcE, GHRE[k-1:1]} : + {Prediction[1], GHRF[k-1:1]}; flopenr #(k) GlobalHistoryRegister(.clk(clk), .reset(reset), - .en(UpdateEN), + .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)), .d(GHRFNext), .q(GHRF)); + // if actively updating the GHR at the time of prediction we want to us + // GHRFNext as the lookup rather than GHRF. + + assign GHRLookup = UpdateEN ? 
GHRFNext : GHRF; + // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT SRAM2P1R1W #(k, 2) PHT(.clk(clk), .reset(reset), .RA1(GHRF), .RD1(Prediction), .REN1(~StallF), - .WA1(GHRF), + .WA1(GHRE), .WD1(UpdatePrediction), .WEN1(UpdateEN), .BitWEN1(2'b11)); + flopenr #(k) GlobalHistoryRegisterD(.clk(clk), + .reset(reset), + .en(~StallD & ~FlushedE), + .d(GHRF), + .q(GHRD)); + + flopenr #(k) GlobalHistoryRegisterE(.clk(clk), + .reset(reset), + .en(~StallE & ~ FlushedE), + .d(GHRD), + .q(GHRE)); + + + flopenr #(1) flushedDReg(.clk(clk), + .reset(reset), + .en(~StallD), + .d(FlushD), + .q(FlushedD)); + + flopenr #(1) flushedEReg(.clk(clk), + .reset(reset), + .en(~StallE), + .d(FlushE | FlushedD), + .q(FlushedE)); + endmodule diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv index 994288bd..0922f787 100644 --- a/wally-pipelined/src/ifu/ifu.sv +++ b/wally-pipelined/src/ifu/ifu.sv @@ -153,14 +153,7 @@ module ifu ( generate if (`BPRED_ENABLED == 1) begin : bpred // I am making the port connection explicit for now as I want to see them and they will be changing. 
- bpred bpred(.clk(clk), - .reset(reset), - .StallF(StallF), - .StallD(StallD), - .StallE(StallE), - .FlushF(FlushF), - .FlushD(FlushD), - .FlushE(FlushE), + bpred bpred(.*, .PCNextF(PCNextF), .BPPredPCF(BPPredPCF), .SelBPPredF(SelBPPredF), From 690815ca51d0ca325c710068f9a0824538f9d4b0 Mon Sep 17 00:00:00 2001 From: Kip Macsai-Goren Date: Fri, 28 May 2021 18:09:28 -0400 Subject: [PATCH 04/19] made priority encoder parameterizable --- wally-pipelined/src/mmu/priority_encoder.sv | 68 ++++++++------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/wally-pipelined/src/mmu/priority_encoder.sv b/wally-pipelined/src/mmu/priority_encoder.sv index e4a62ce1..dade2e83 100644 --- a/wally-pipelined/src/mmu/priority_encoder.sv +++ b/wally-pipelined/src/mmu/priority_encoder.sv @@ -4,7 +4,11 @@ // Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021 // Based on implementation from https://www.allaboutcircuits.com/ip-cores/communication-controller/priority-encoder/ // *** Give proper LGPL attribution for above source -// Modified: +// Modified: Teo Ene 15 Apr 2021: +// Temporarily removed paramterized priority encoder for non-parameterized one +// To get synthesis working quickly +// Kmacsaigoren@hmc.edu 28 May 2021: +// Added working version of parameterized priority encoder. 
// // Purpose: One-hot encoding to binary encoder // @@ -27,51 +31,33 @@ `include "wally-config.vh" -// Teo Ene 04/15: -// Temporarily removed paramterized priority encoder for non-parameterized one -// To get synthesis working quickly module priority_encoder #(parameter BINARY_BITS = 3) ( - input logic [7:0] one_hot, - output logic [2:0] binary + input logic [2**BINARY_BITS - 1:0] one_hot, + output logic [BINARY_BITS - 1:0] binary ); - // localparam ONE_HOT_BITS = 2**BINARY_BITS; - - /* - genvar i, j; - generate - for (i = 0; i < ONE_HOT_BITS; i++) begin - for (j = 0; j < BINARY_BITS; j++) begin - if (i[j]) begin - assign binary[j] = one_hot[i]; - end - end - end - endgenerate - */ - - /* - logic [BINARY_BITS-1:0] binary_comb; - + integer i; always_comb begin - binary_comb = 0; - for (int i = 0; i < ONE_HOT_BITS; i++) - if (one_hot[i]) binary_comb = i; + binary = 0; + for (i = 0; i < 2**BINARY_BITS; i++) begin + if (one_hot[i]) binary = i; // prioritizes the most significant bit + end end + // *** triple check synthesizability here - assign binary = binary_comb; + // Ideally this mimics the following: + /* + always_comb begin + casex (one_hot) + 1xx ... x: binary = BINARY_BITS - 1; + 01x ... x: binary = BINARY_BITS - 2; + 001 ... x: binary = BINARY_BITS - 3; + + {...} + + 00 ... 1xx: binary = 2; + 00 ... 01x: binary = 1; + 00 ... 001: binary = 0; + end */ - always_comb - case (one_hot) - 8'h1: binary=3'h0; - 8'h2: binary=3'h1; - 8'h4: binary=3'h2; - 8'h8: binary=3'h3; - 8'h10: binary=3'h4; - 8'h20: binary=3'h5; - 8'h40: binary=3'h6; - 8'h80: binary=3'h7; - default: binary=3'h0; //should never happen - endcase - endmodule From 12c34c25f3f122c90b8a99f6fdb4590f721fe0d2 Mon Sep 17 00:00:00 2001 From: "James E. Stine" Date: Mon, 31 May 2021 08:36:19 -0400 Subject: [PATCH 05/19] Modify elements of generics for LZD and shifter wrote for integer divider. 
--- wally-pipelined/src/generic/lzd.sv | 195 +++++++++++++++++++++++++++ wally-pipelined/src/generic/lzd.sv~ | 195 +++++++++++++++++++++++++++ wally-pipelined/src/generic/shift.sv | 76 +++++++++++ wally-pipelined/src/muldiv/div.sv | 146 +------------------- 4 files changed, 471 insertions(+), 141 deletions(-) create mode 100755 wally-pipelined/src/generic/lzd.sv create mode 100755 wally-pipelined/src/generic/lzd.sv~ create mode 100755 wally-pipelined/src/generic/shift.sv diff --git a/wally-pipelined/src/generic/lzd.sv b/wally-pipelined/src/generic/lzd.sv new file mode 100755 index 00000000..98642c15 --- /dev/null +++ b/wally-pipelined/src/generic/lzd.sv @@ -0,0 +1,195 @@ +/////////////////////////////////////////// +// lzd.sv +// +// Written: James.Stine@okstate.edu 1 February 2021 +// Modified: +// +// Purpose: Integer Divide instructions +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +`include "wally-config.vh" +/* verilator lint_off DECLFILENAME */ + +// Original idea came from V. G. Oklobdzija, "An algorithmic and novel +// design of a leading zero detector circuit: comparison with logic +// synthesis," in IEEE Transactions on Very Large Scale Integration +// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi: +// 10.1109/92.273153. + +// Modified to be more hierarchical + +module lzd2 (P, V, B); + + input logic [1:0] B; + + output logic P; + output logic V; + + assign V = B[0] | B[1]; + assign P = B[0] & ~B[1]; + +endmodule // lz2 + +module lzd_hier #(parameter WIDTH=8) + (input logic [WIDTH-1:0] B, + output logic [$clog2(WIDTH)-1:0] ZP, + output logic ZV); + + if (WIDTH == 128) + lzd128 lz127 (ZP, ZV, B); + else if (WIDTH == 64) + lzd64 lz64 (ZP, ZV, B); + else if (WIDTH == 32) + lzd32 lz32 (ZP, ZV, B); + else if (WIDTH == 16) + lzd16 lz16 (ZP, ZV, B); + else if (WIDTH == 8) + lzd8 lz8 (ZP, ZV, B); + else if (WIDTH == 4) + lzd4 lz4 (ZP, ZV, B); + +endmodule // lzd_hier + +module lzd4 (ZP, ZV, B); + + input logic [3:0] B; + + logic ZPa; + logic ZPb; + logic ZVa; + logic ZVb; + + output logic [1:0] ZP; + output logic ZV; + + lz2 l1(ZPa, ZVa, B[1:0]); + lz2 l2(ZPb, ZVb, B[3:2]); + + assign ZP[0:0] = ZVb ? ZPb : ZPa; + assign ZP[1] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd4 + +module lzd8 (ZP, ZV, B); + + input logic [7:0] B; + + logic [1:0] ZPa; + logic [1:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [2:0] ZP; + output logic ZV; + + lz4 l1(ZPa, ZVa, B[3:0]); + lz4 l2(ZPb, ZVb, B[7:4]); + + assign ZP[1:0] = ZVb ? 
ZPb : ZPa; + assign ZP[2] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd8 + +module lzd16 (ZP, ZV, B); + + input logic [15:0] B; + + logic [2:0] ZPa; + logic [2:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [3:0] ZP; + output logic ZV; + + lz8 l1(ZPa, ZVa, B[7:0]); + lz8 l2(ZPb, ZVb, B[15:8]); + + assign ZP[2:0] = ZVb ? ZPb : ZPa; + assign ZP[3] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd16 + +module lzd32 (ZP, ZV, B); + + input logic [31:0] B; + + logic [3:0] ZPa; + logic [3:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [4:0] ZP; + output logic ZV; + + lz16 l1(ZPa, ZVa, B[15:0]); + lz16 l2(ZPb, ZVb, B[31:16]); + + assign ZP[3:0] = ZVb ? ZPb : ZPa; + assign ZP[4] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd32 + +module lzd64 (ZP, ZV, B); + + input logic [63:0] B; + + logic [4:0] ZPa; + logic [4:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [5:0] ZP; + output logic ZV; + + lz32 l1(ZPa, ZVa, B[31:0]); + lz32 l2(ZPb, ZVb, B[63:32]); + + assign ZP[4:0] = ZVb ? ZPb : ZPa; + assign ZP[5] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd64 + +module lzd128 (ZP, ZV, B); + + input logic [127:0] B; + + logic [5:0] ZPa; + logic [5:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [6:0] ZP; + output logic ZV; + + lz64 l1(ZPa, ZVa, B[64:0]); + lz64 l2(ZPb, ZVb, B[127:63]); + + assign ZP[5:0] = ZVb ? ZPb : ZPa; + assign ZP[6] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd128 + +/* verilator lint_on DECLFILENAME */ diff --git a/wally-pipelined/src/generic/lzd.sv~ b/wally-pipelined/src/generic/lzd.sv~ new file mode 100755 index 00000000..bfffe5e5 --- /dev/null +++ b/wally-pipelined/src/generic/lzd.sv~ @@ -0,0 +1,195 @@ +/////////////////////////////////////////// +// lzd.sv +// +// Written: James.Stine@okstate.edu 1 February 2021 +// Modified: +// +// Purpose: Integer Divide instructions +// +// A component of the Wally configurable RISC-V project. 
+// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +`include "wally-config.vh" +/* verilator lint_off DECLFILENAME */ + +// Original idea came from V. G. Oklobdzija, "An algorithmic and novel +// design of a leading zero detector circuit: comparison with logic +// synthesis," in IEEE Transactions on Very Large Scale Integration +// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi: +// 10.1109/92.273153. 
+ +// Modified to be more hierarchical + +module lz2 (P, V, B); + + input logic [1:0] B; + + output logic P; + output logic V; + + assign V = B[0] | B[1]; + assign P = B[0] & ~B[1]; + +endmodule // lz2 + +module lzd_hier #(parameter WIDTH=8) + (input logic [WIDTH-1:0] B, + output logic [$clog2(WIDTH)-1:0] ZP, + output logic ZV); + + if (WIDTH == 128) + lz128 lzd127 (ZP, ZV, B); + else if (WIDTH == 64) + lz64 lzd64 (ZP, ZV, B); + else if (WIDTH == 32) + lz32 lzd32 (ZP, ZV, B); + else if (WIDTH == 16) + lz16 lzd16 (ZP, ZV, B); + else if (WIDTH == 8) + lz8 lzd8 (ZP, ZV, B); + else if (WIDTH == 4) + lz4 lzd4 (ZP, ZV, B); + +endmodule // lzd_hier + +module lz4 (ZP, ZV, B); + + input logic [3:0] B; + + logic ZPa; + logic ZPb; + logic ZVa; + logic ZVb; + + output logic [1:0] ZP; + output logic ZV; + + lz2 l1(ZPa, ZVa, B[1:0]); + lz2 l2(ZPb, ZVb, B[3:2]); + + assign ZP[0:0] = ZVb ? ZPb : ZPa; + assign ZP[1] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule + +module lz8 (ZP, ZV, B); + + input logic [7:0] B; + + logic [1:0] ZPa; + logic [1:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [2:0] ZP; + output logic ZV; + + lz4 l1(ZPa, ZVa, B[3:0]); + lz4 l2(ZPb, ZVb, B[7:4]); + + assign ZP[1:0] = ZVb ? ZPb : ZPa; + assign ZP[2] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule + +module lz16 (ZP, ZV, B); + + input logic [15:0] B; + + logic [2:0] ZPa; + logic [2:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [3:0] ZP; + output logic ZV; + + lz8 l1(ZPa, ZVa, B[7:0]); + lz8 l2(ZPb, ZVb, B[15:8]); + + assign ZP[2:0] = ZVb ? ZPb : ZPa; + assign ZP[3] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lz16 + +module lz32 (ZP, ZV, B); + + input logic [31:0] B; + + logic [3:0] ZPa; + logic [3:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [4:0] ZP; + output logic ZV; + + lz16 l1(ZPa, ZVa, B[15:0]); + lz16 l2(ZPb, ZVb, B[31:16]); + + assign ZP[3:0] = ZVb ? 
ZPb : ZPa; + assign ZP[4] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lz32 + +module lz64 (ZP, ZV, B); + + input logic [63:0] B; + + logic [4:0] ZPa; + logic [4:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [5:0] ZP; + output logic ZV; + + lz32 l1(ZPa, ZVa, B[31:0]); + lz32 l2(ZPb, ZVb, B[63:32]); + + assign ZP[4:0] = ZVb ? ZPb : ZPa; + assign ZP[5] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lz64 + +module lz128 (ZP, ZV, B); + + input logic [127:0] B; + + logic [5:0] ZPa; + logic [5:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [6:0] ZP; + output logic ZV; + + lz64 l1(ZPa, ZVa, B[64:0]); + lz64 l2(ZPb, ZVb, B[127:63]); + + assign ZP[5:0] = ZVb ? ZPb : ZPa; + assign ZP[6] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lz128 + +/* verilator lint_on DECLFILENAME */ diff --git a/wally-pipelined/src/generic/shift.sv b/wally-pipelined/src/generic/shift.sv new file mode 100755 index 00000000..88152588 --- /dev/null +++ b/wally-pipelined/src/generic/shift.sv @@ -0,0 +1,76 @@ +/////////////////////////////////////////// +// shifters.sv +// +// Written: James.Stine@okstate.edu 1 February 2021 +// Modified: +// +// Purpose: Integer Divide instructions +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +`include "wally-config.vh" +/* verilator lint_off DECLFILENAME */ +/* verilator lint_off UNOPTFLAT */ + +module shift_right #(parameter WIDTH=8) + (input logic [WIDTH-1:0] A, + input logic [$clog2(WIDTH)-1:0] Shift, + output logic [WIDTH-1:0] Z); + + logic [WIDTH-1:0] stage [$clog2(WIDTH):0]; + logic sign; + genvar i; + + assign stage[0] = A; + generate + for (i=0;i<$clog2(WIDTH);i=i+1) + begin : genbit + mux2 #(WIDTH) mux_inst (stage[i], + {{(WIDTH/(2**(i+1))){1'b0}}, stage[i][WIDTH-1:WIDTH/(2**(i+1))]}, + Shift[$clog2(WIDTH)-i-1], + stage[i+1]); + end + endgenerate + assign Z = stage[$clog2(WIDTH)]; + +endmodule // shift_right + +module shift_left #(parameter WIDTH=8) + (input logic [WIDTH-1:0] A, + input logic [$clog2(WIDTH)-1:0] Shift, + output logic [WIDTH-1:0] Z); + + logic [WIDTH-1:0] stage [$clog2(WIDTH):0]; + genvar i; + + assign stage[0] = A; + generate + for (i=0;i<$clog2(WIDTH);i=i+1) + begin : genbit + mux2 #(WIDTH) mux_inst (stage[i], + {stage[i][WIDTH-1-WIDTH/(2**(i+1)):0], {(WIDTH/(2**(i+1))){1'b0}}}, + Shift[$clog2(WIDTH)-i-1], + stage[i+1]); + end + endgenerate + assign Z = stage[$clog2(WIDTH)]; + +endmodule // shift_left + +/* verilator lint_on DECLFILENAME */ +/* verilator lint_on UNOPTFLAT */ diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv index db830ca3..4266ae61 100755 --- a/wally-pipelined/src/muldiv/div.sv +++ b/wally-pipelined/src/muldiv/div.sv @@ -78,11 +78,7 @@ module div (Qf, remf, 
done, divBusy, div0, N, D, clk, reset, start, S); assign D_NegOne = &D; // Divider goes the distance to 37 cycles - // (thanks the evil divisor for D = 0x1) - // but could theoretically be stopped when - // divdone is asserted. The enable signal - // turns off register storage thus invalidating - // any future cycles. + // (thanks to the evil divisor for D = 0x1) // Shift D, if needed (for integer) // needed to allow qst to be in range for integer @@ -93,8 +89,8 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); // exception is given to FSM to tell the operation to // quit gracefully. - lz64 p1 (P, V, twoD); - shifter_l64 p2 (op2, twoD, P); + lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD)); + shift_left #(64) p2 (twoD, P, op2); assign op1 = twoN; assign div0 = ~V; @@ -141,9 +137,8 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); assign Q = Qd2[63:0]; assign Rem5 = Rd2[64:1]; - // Adjust remainder by m (no need to adjust by - // n ln(r) - shifter_r64 p4 (rem0, Rem5, RemShift); + // Adjust remainder by m + shift_right #(64) p4 (Rem5, RemShift, rem0); // Adjust Q/Rem for Signed assign tcQ = (SignN ^ SignD) & S; @@ -368,8 +363,6 @@ module qst4 (input logic [6:0] s, input logic [2:0] d, endmodule // qst4 -// LZD - module lz2 (P, V, B0, B1); input logic B0; @@ -497,7 +490,6 @@ module lz64 (ZP, ZV, B); endmodule // lz64 // FSM Control for Integer Divider - module fsm64 (en, state0, done, divdone, otfzero, divBusy, start, error, NumIter, clk, reset); @@ -1505,134 +1497,6 @@ module magcompare8 (LT, EQ, A, B); endmodule // magcompare8 -module shifter_l64 (Z, A, Shift); - - input logic [63:0] A; - input logic [5:0] Shift; - - logic [63:0] stage1; - logic [63:0] stage2; - logic [63:0] stage3; - logic [63:0] stage4; - logic [63:0] stage5; - - output logic [63:0] Z; - - mux2 #(64) mx01(A, {A[31:0], 32'h0}, Shift[5], stage1); - mux2 #(64) mx02(stage1, {stage1[47:0], 16'h0}, Shift[4], stage2); - mux2 #(64) mx03(stage2, {stage2[55:0], 
8'h0}, Shift[3], stage3); - mux2 #(64) mx04(stage3, {stage3[59:0], 4'h0}, Shift[2], stage4); - mux2 #(64) mx05(stage4, {stage4[61:0], 2'h0}, Shift[1], stage5); - mux2 #(64) mx06(stage5, {stage5[62:0], 1'h0}, Shift[0], Z); - -endmodule // shifter_l64 - -module shifter_r64 (Z, A, Shift); - - input logic [63:0] A; - input logic [5:0] Shift; - - logic [63:0] stage1; - logic [63:0] stage2; - logic [63:0] stage3; - logic [63:0] stage4; - logic [63:0] stage5; - - output logic [63:0] Z; - - mux2 #(64) mx01(A, {32'h0, A[63:32]}, Shift[5], stage1); - mux2 #(64) mx02(stage1, {16'h0, stage1[63:16]}, Shift[4], stage2); - mux2 #(64) mx03(stage2, {8'h0, stage2[63:8]}, Shift[3], stage3); - mux2 #(64) mx04(stage3, {4'h0, stage3[63:4]}, Shift[2], stage4); - mux2 #(64) mx05(stage4, {2'h0, stage4[63:2]}, Shift[1], stage5); - mux2 #(64) mx06(stage5, {1'h0, stage5[63:1]}, Shift[0], Z); - -endmodule // shifter_r64 - -module shifter_l32 (Z, A, Shift); - - input logic [31:0] A; - input logic [4:0] Shift; - - logic [31:0] stage1; - logic [31:0] stage2; - logic [31:0] stage3; - logic [31:0] stage4; - - output logic [31:0] Z; - - mux2 #(32) mx01(A, {A[15:0], 16'h0}, Shift[4], stage1); - mux2 #(32) mx02(stage1, {stage1[23:0], 8'h0}, Shift[3], stage2); - mux2 #(32) mx03(stage2, {stage2[27:0], 4'h0}, Shift[2], stage3); - mux2 #(32) mx04(stage3, {stage3[29:0], 2'h0}, Shift[1], stage4); - mux2 #(32) mx05(stage4, {stage4[30:0], 1'h0}, Shift[0], Z); - -endmodule // shifter_l32 - -module shifter_r32 (Z, A, Shift); - - input logic [31:0] A; - input logic [4:0] Shift; - - logic [31:0] stage1; - logic [31:0] stage2; - logic [31:0] stage3; - logic [31:0] stage4; - - output logic [31:0] Z; - - mux2 #(32) mx01(A, {16'h0, A[31:16]}, Shift[4], stage1); - mux2 #(32) mx02(stage1, {8'h0, stage1[31:8]}, Shift[3], stage2); - mux2 #(32) mx03(stage2, {4'h0, stage2[31:4]}, Shift[2], stage3); - mux2 #(32) mx04(stage3, {2'h0, stage3[31:2]}, Shift[1], stage4); - mux2 #(32) mx05(stage4, {1'h0, stage4[31:1]}, Shift[0], 
Z); - -endmodule // shifter_r32 - -module shift_right #(parameter WIDTH=8) - (input logic [`XLEN-1:0] A, - input logic [$clog2(`XLEN)-1:0] Shift, - output logic [`XLEN-1:0] Z); - - logic [`XLEN-1:0] stage [$clog2(`XLEN):0]; - genvar i; - - assign stage[0] = A; - generate - for (i=0;i<$clog2(`XLEN);i=i+1) - begin : genbit - mux2 #(`XLEN) mux_inst (stage[i], - {{(`XLEN/(2**(i+1))){1'b0}}, stage[i][`XLEN-1:`XLEN/(2**(i+1))]}, - Shift[$clog2(`XLEN)-i-1], - stage[i+1]); - end - endgenerate - assign Z = stage[$clog2(`XLEN)]; - -endmodule // shift_right - -module shift_left #(parameter WIDTH=8) - (input logic [`XLEN-1:0] A, - input logic [$clog2(`XLEN)-1:0] Shift, - output logic [`XLEN-1:0] Z); - - logic [`XLEN-1:0] stage [$clog2(`XLEN):0]; - genvar i; - - assign stage[0] = A; - generate - for (i=0;i<$clog2(`XLEN);i=i+1) - begin : genbit - mux2 #(`XLEN) mux_inst (stage[i], - {stage[i][`XLEN-1-`XLEN/(2**(i+1)):0], {(`XLEN/(2**(i+1))){1'b0}}}, - Shift[$clog2(`XLEN)-i-1], - stage[i+1]); - end - endgenerate - assign Z = stage[$clog2(`XLEN)]; - -endmodule // shift_right - module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); input logic [63:0] Q; From 9954d16fc91017dae8df34f0b60f6ab188242708 Mon Sep 17 00:00:00 2001 From: "James E. Stine" Date: Mon, 31 May 2021 09:12:21 -0400 Subject: [PATCH 06/19] Add enhancements to integer divider including: - better comments - optimize FSM to end earlier - passes for 32-bit or 64-bit depending on parameter to intdiv Left div.bak in just in case have to revert back to original for now. 
--- wally-pipelined/src/muldiv/div.bak | 1560 ++++++++++++++++++++++++++ wally-pipelined/src/muldiv/div.sv | 614 ++++------ wally-pipelined/src/muldiv/muldiv.sv | 3 +- 3 files changed, 1773 insertions(+), 404 deletions(-) create mode 100755 wally-pipelined/src/muldiv/div.bak diff --git a/wally-pipelined/src/muldiv/div.bak b/wally-pipelined/src/muldiv/div.bak new file mode 100755 index 00000000..4266ae61 --- /dev/null +++ b/wally-pipelined/src/muldiv/div.bak @@ -0,0 +1,1560 @@ +/////////////////////////////////////////// +// mul.sv +// +// Written: James.Stine@okstate.edu 1 February 2021 +// Modified: +// +// Purpose: Integer Divide instructions +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +// *** I added these verilator controls to clean up the +// lint output. 
The linter warnings should be fixed, but now the output is at +// least readable. +/* verilator lint_off COMBDLY */ +/* verilator lint_off IMPLICIT */ + +`include "wally-config.vh" + +module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); + + input logic [63:0] N, D; + input logic clk; + input logic reset; + input logic start; + input logic S; + + output logic [63:0] Qf; + output logic [63:0] remf; + output logic div0; + output logic done; + output logic divBusy; + + logic divdone; + logic enable; + logic state0; + logic V; + logic [7:0] Num; + logic [5:0] P, NumIter, RemShift; + logic [63:0] op1, op2, op1shift, Rem5; + logic [64:0] Qd, Rd, Qd2, Rd2; + logic [63:0] Q, rem0; + logic [3:0] quotient; + logic otfzero; + logic shiftResult; + logic enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp; + + logic [63:0] twoD; + logic [63:0] twoN; + logic SignD; + logic SignN; + logic [63:0] QT, remT; + logic D_NegOne; + logic Max_N; + + // Check if negative (two's complement) + // If so, convert to positive + adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD); + adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN); + assign SignD = D[63]; + assign SignN = N[63]; + // Max N and D = -1 (Overflow) + assign Max_N = (~|N[62:0]) & N[63]; + assign D_NegOne = &D; + + // Divider goes the distance to 37 cycles + // (thanks to the evil divisor for D = 0x1) + + // Shift D, if needed (for integer) + // needed to allow qst to be in range for integer + // division [1,2) and allow integer divide to work. + // + // The V or valid bit can be used to determine if D + // is 0 and thus a divide by 0 exception. This div0 + // exception is given to FSM to tell the operation to + // quit gracefully. 
+ + lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD)); + shift_left #(64) p2 (twoD, P, op2); + assign op1 = twoN; + assign div0 = ~V; + + // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0) + // v = 2 since \rho < 1 (add 4 to make sure its a ceil) + adder #(8) cpa3 ({2'b0, P}, + {5'h0, shiftResult, ~shiftResult, 1'b0}, + Num); + + // Determine whether need to add just Q/Rem + assign shiftResult = P[0]; + // div by 2 (ceil) + assign NumIter = Num[6:1]; + assign RemShift = P; + + // FSM to control integer divider + // assume inputs are postive edge and + // datapath (divider) is negative edge + fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv, + start, div0, NumIter, ~clk, reset); + + flopr #(1) rega (~clk, reset, donev, done); + flopr #(1) regb (~clk, reset, divdonev, divdone); + flopr #(1) regc (~clk, reset, otfzerov, otfzero); + flopr #(1) regd (~clk, reset, enablev, enable); + flopr #(1) rege (~clk, reset, state0v, state0); + flopr #(1) regf (~clk, reset, divBusyv, divBusy); + + // To obtain a correct remainder the last bit of the + // quotient has to be aligned with a radix-r boundary. + // Since the quotient is in the range 1/2 < q < 2 (one + // integer bit and m fractional bits), this is achieved by + // shifting N right by v+s so that (m+v+s) mod k = 0. And, + // the quotient has to be aligned to the integer position. + + divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, + enable, otfzero, shiftResult); + + // Storage registers to hold contents stable + flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2); + flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2); + + // Probably not needed - just assigns results + assign Q = Qd2[63:0]; + assign Rem5 = Rd2[64:1]; + + // Adjust remainder by m + shift_right #(64) p4 (Rem5, RemShift, rem0); + + // Adjust Q/Rem for Signed + assign tcQ = (SignN ^ SignD) & S; + assign tcR = SignN & S; + // Signed Divide + // - When N and D are negative: Remainder is negative (undergoes a two's complement). 
+ // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement). + // - When D is negative: Quotient is negative (undergoes a two's complement). + adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT); + adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT); + + // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec) + exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf); + +endmodule // int32div + +module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, + enable, otfzero, shiftResult); + + input logic [63:0] op1, op2; + input logic clk, state0; + input logic reset; + input logic enable; + input logic otfzero; + input logic shiftResult; + + output logic [64:0] rem0; + output logic [64:0] Q; + output logic [3:0] quotient; + + logic [67:0] Sum, Carry; + logic [64:0] Qstar; + logic [64:0] QMstar; + logic [7:0] qtotal; + logic [67:0] SumN, CarryN, SumN2, CarryN2; + logic [67:0] divi1, divi2, divi1c, divi2c, dive1; + logic [67:0] mdivi_temp, mdivi; + logic zero; + logic [1:0] qsel; + logic [1:0] Qin, QMin; + logic CshiftQ, CshiftQM; + logic [67:0] rem1, rem2, rem3; + logic [67:0] SumR, CarryR; + logic [64:0] Qt; + + // Create one's complement values of Divisor (for q*D) + assign divi1 = {3'h0, op2, 1'b0}; + assign divi2 = {2'h0, op2, 2'b0}; + assign divi1c = ~divi1; + assign divi2c = ~divi2; + // Shift x1 if not mod k + mux2 #(68) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1); + + // I I I . F F F F F ... 
(Robertson Criteria - \rho * qmax * D) + mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN); + mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN); + // Simplify QST + adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal); + // q = {+2, +1, -1, -2} else q = 0 + qst4 pd1 (qtotal[7:1], divi1[63:61], quotient); + assign ulp = quotient[2]|quotient[3]; + assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]); + // Map to binary encoding + assign qsel[1] = quotient[3]|quotient[2]; + assign qsel[0] = quotient[3]|quotient[1]; + mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp); + mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi); + csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry); + // regs : save CSA + flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2); + flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2); + // OTF + ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM); + otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, + otfzero, enable, Qstar, QMstar); + + // Correction and generation of Remainder + adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1); + // Add back +D as correction + csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR); + adder #(68) cpa3 (SumR, CarryR, rem2); + // Choose remainder (Rem or Rem+D) + mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3); + // Choose correct Q or QM + mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt); + // Final results + assign rem0 = rem3[64:0]; + assign Q = Qt; + +endmodule // divide4x64 + +module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM); + + input logic [3:0] quot; + + output logic [1:0] Qin; + output logic [1:0] QMin; + output logic CshiftQ; + output logic CshiftQM; + + // Load/Store Control for OTF + assign Qin[1] = (quot[1]) | (quot[3]) | (quot[0]); + assign Qin[0] = (quot[1]) | (quot[2]); + assign QMin[1] = (quot[1]) | (!quot[3]&!quot[2]&!quot[1]&!quot[0]); + assign QMin[0] = (quot[3]) | (quot[0]) | + (!quot[3]&!quot[2]&!quot[1]&!quot[0]); + assign 
CshiftQ = (quot[1]) | (quot[0]); + assign CshiftQM = (quot[3]) | (quot[2]); + +endmodule + +// On-the-fly Conversion per Ercegovac/Lang + +module otf #(parameter WIDTH=8) + (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q); + + input logic [1:0] Qin, QMin; + input logic CshiftQ, CshiftQM; + input logic clk; + input logic reset; + input logic enable; + + output logic [WIDTH-1:0] R2Q; + output logic [WIDTH-1:0] R1Q; + + logic [WIDTH-1:0] Qstar, QMstar; + logic [WIDTH-1:0] M1Q, M2Q; + + // QM + mux2 #(WIDTH) m1 (QMstar, Qstar, CshiftQM, M1Q); + flopenr #(WIDTH) r1 (clk, reset, enable, {M1Q[WIDTH-3:0], QMin}, R1Q); + // Q + mux2 #(WIDTH) m2 (Qstar, QMstar, CshiftQ, M2Q); + flopenr #(WIDTH) r2 (clk, reset, enable, {M2Q[WIDTH-3:0], Qin}, R2Q); + + assign Qstar = R2Q; + assign QMstar = R1Q; + +endmodule // otf8 + +module adder #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, + output logic [WIDTH-1:0] y); + + assign y = a + b; + +endmodule // adder + +module fa (input logic a, b, c, output logic sum, carry); + + assign sum = a^b^c; + assign carry = a&b|a&c|b&c; + +endmodule // fa + +module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c, + output logic [WIDTH-1:0] sum, carry); + + logic [WIDTH:0] carry_temp; + genvar i; + generate + for (i=0;i B. LT and GT are both '0' if A = B. + +module magcompare2b (LT, GT, A, B); + + input logic [1:0] A; + input logic [1:0] B; + + output logic LT; + output logic GT; + + // Determine if A < B using a minimized sum-of-products expression + assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0]; + // Determine if A > B using a minimized sum-of-products expression + assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0]; + +endmodule // magcompare2b + +// J. E. Stine and M. J. Schulte, "A combined two's complement and +// floating-point comparator," 2005 IEEE International Symposium on +// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531 + +module magcompare8 (LT, EQ, A, B); + + input logic [7:0] A; + input logic [7:0] B; + + logic [3:0] s; + logic [3:0] t; + logic [1:0] u; + logic [1:0] v; + logic GT; + //wire LT; + + output logic EQ; + output logic LT; + + magcompare2b mag1 (s[0], t[0], A[1:0], B[1:0]); + magcompare2b mag2 (s[1], t[1], A[3:2], B[3:2]); + magcompare2b mag3 (s[2], t[2], A[5:4], B[5:4]); + magcompare2b mag4 (s[3], t[3], A[7:6], B[7:6]); + + magcompare2b mag5 (u[0], v[0], t[1:0], s[1:0]); + magcompare2b mag6 (u[1], v[1], t[3:2], s[3:2]); + + magcompare2b mag7 (LT, GT, v[1:0], u[1:0]); + + assign EQ = ~(GT | LT); + +endmodule // magcompare8 + +module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); + + input logic [63:0] Q; + input logic [63:0] rem; + input logic [63:0] op1; + input logic S; + input logic div0; + input logic Max_N; + input logic D_NegOne; + + output logic [63:0] Qf; + output logic [63:0] remf; + + // Needs to be optimized + always_comb + case ({div0, S, Max_N, D_NegOne}) + 4'b0000 : Qf = Q; + 4'b0001 : Qf = Q; + 4'b0010 : Qf = Q; + 4'b0011 : Qf = Q; + 4'b0100 : Qf = Q; + 4'b0101 : Qf = Q; + 4'b0110 : Qf = Q; + 4'b0111 : Qf = {1'b1, 31'h0}; + 4'b1000 : Qf = {64{1'b1}}; + 4'b1001 : Qf = {64{1'b1}}; + 4'b1010 : Qf = {64{1'b1}}; + 4'b1011 : Qf = {64{1'b1}}; + 4'b1100 : Qf = {64{1'b1}}; + 4'b1101 : Qf = {64{1'b1}}; + 4'b1110 : Qf = {64{1'b1}}; + 4'b1111 : Qf = {64{1'b1}}; + default: Qf = Q; + endcase + + always_comb + case ({div0, S, Max_N, D_NegOne}) + 4'b0000 : remf = rem; + 4'b0001 : remf = rem; + 4'b0010 : remf = rem; + 4'b0011 : remf = rem; + 4'b0100 : remf = rem; + 4'b0101 : remf = rem; + 4'b0110 : remf = rem; + 4'b0111 : remf = 64'h0; + 4'b1000 : remf = op1; + 4'b1001 : remf = op1; + 4'b1010 : remf = op1; + 4'b1011 : remf = op1; + 4'b1100 : remf = op1; + 4'b1101 : remf = op1; + 4'b1110 : remf = op1; + 4'b1111 : remf = op1; + default: remf = rem; + endcase + +endmodule // exception_int + +/* verilator lint_on 
COMBDLY */ +/* verilator lint_on IMPLICIT */ + diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv index 4266ae61..107b002f 100755 --- a/wally-pipelined/src/muldiv/div.sv +++ b/wally-pipelined/src/muldiv/div.sv @@ -1,5 +1,5 @@ /////////////////////////////////////////// -// mul.sv +// divide4x64.sv // // Written: James.Stine@okstate.edu 1 February 2021 // Modified: @@ -29,54 +29,53 @@ /* verilator lint_off COMBDLY */ /* verilator lint_off IMPLICIT */ -`include "wally-config.vh" +module intdiv #(parameter WIDTH=64) + (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); -module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); - - input logic [63:0] N, D; - input logic clk; - input logic reset; - input logic start; - input logic S; + input logic [WIDTH-1:0] N, D; + input logic clk; + input logic reset; + input logic start; + input logic S; + + output logic [WIDTH-1:0] Qf; + output logic [WIDTH-1:0] remf; + output logic div0; + output logic done; + output logic divBusy; + + logic enable; + logic state0; + logic V; + logic [$clog2(WIDTH):0] Num; + logic [$clog2(WIDTH)-1:0] P, NumIter, RemShift; + logic [WIDTH-1:0] op1, op2, op1shift, Rem5; + logic [WIDTH:0] Qd, Rd, Qd2, Rd2; + logic [WIDTH-1:0] Q, rem0; + logic [3:0] quotient; + logic otfzero; + logic shiftResult; + logic enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp; + + logic [WIDTH-1:0] twoD; + logic [WIDTH-1:0] twoN; + logic SignD; + logic SignN; + logic [WIDTH-1:0] QT, remT; + logic D_NegOne; + logic Max_N; - output logic [63:0] Qf; - output logic [63:0] remf; - output logic div0; - output logic done; - output logic divBusy; - - logic divdone; - logic enable; - logic state0; - logic V; - logic [7:0] Num; - logic [5:0] P, NumIter, RemShift; - logic [63:0] op1, op2, op1shift, Rem5; - logic [64:0] Qd, Rd, Qd2, Rd2; - logic [63:0] Q, rem0; - logic [3:0] quotient; - logic otfzero; - logic shiftResult; - logic enablev, state0v, donev, divdonev, oftzerov, 
divBusyv, ulp; - - logic [63:0] twoD; - logic [63:0] twoN; - logic SignD; - logic SignN; - logic [63:0] QT, remT; - logic D_NegOne; - logic Max_N; // Check if negative (two's complement) // If so, convert to positive - adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD); - adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN); - assign SignD = D[63]; - assign SignN = N[63]; + adder #(WIDTH) cpa1 ((D ^ {WIDTH{D[WIDTH-1]&S}}), {{WIDTH-1{1'b0}}, D[WIDTH-1]&S}, twoD); + adder #(WIDTH) cpa2 ((N ^ {WIDTH{N[WIDTH-1]&S}}), {{WIDTH-1{1'b0}}, N[WIDTH-1]&S}, twoN); + assign SignD = D[WIDTH-1]; + assign SignN = N[WIDTH-1]; // Max N and D = -1 (Overflow) - assign Max_N = (~|N[62:0]) & N[63]; + assign Max_N = (~|N[WIDTH-2:0]) & N[WIDTH-1]; assign D_NegOne = &D; - + // Divider goes the distance to 37 cycles // (thanks to the evil divisor for D = 0x1) @@ -89,31 +88,31 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); // exception is given to FSM to tell the operation to // quit gracefully. 
- lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD)); - shift_left #(64) p2 (twoD, P, op2); - assign op1 = twoN; + lzd_hier #(WIDTH) p1 (.ZP(P), .ZV(V), .B(twoD)); + shift_left #(WIDTH) p2 (twoD, P, op2); + assign op1 = twoN; assign div0 = ~V; - // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0) + // #iter: N = m+v+s = m+2+s (mod k = 0) // v = 2 since \rho < 1 (add 4 to make sure its a ceil) - adder #(8) cpa3 ({2'b0, P}, - {5'h0, shiftResult, ~shiftResult, 1'b0}, - Num); + // k = 2 (r = 2^k) + adder #($clog2(WIDTH)+1) cpa3 ({1'b0, P}, + {{$clog2(WIDTH)+1-3{1'b0}}, shiftResult, ~shiftResult, 1'b0}, + Num); // Determine whether need to add just Q/Rem assign shiftResult = P[0]; // div by 2 (ceil) - assign NumIter = Num[6:1]; + assign NumIter = Num[$clog2(WIDTH):1]; assign RemShift = P; // FSM to control integer divider // assume inputs are postive edge and // datapath (divider) is negative edge - fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv, - start, div0, NumIter, ~clk, reset); + fsm64 #($clog2(WIDTH)) fsm1 (enablev, state0v, donev, otfzerov, divBusyv, + start, div0, NumIter, ~clk, reset); flopr #(1) rega (~clk, reset, donev, done); - flopr #(1) regb (~clk, reset, divdonev, divdone); flopr #(1) regc (~clk, reset, otfzerov, otfzero); flopr #(1) regd (~clk, reset, enablev, enable); flopr #(1) rege (~clk, reset, state0v, state0); @@ -125,64 +124,66 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); // integer bit and m fractional bits), this is achieved by // shifting N right by v+s so that (m+v+s) mod k = 0. And, // the quotient has to be aligned to the integer position. 
- - divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, - enable, otfzero, shiftResult); + divide4 #(WIDTH) p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, + enable, otfzero, shiftResult); // Storage registers to hold contents stable - flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2); - flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2); + flopenr #(WIDTH+1) reg3 (clk, reset, enable, Rd, Rd2); + flopenr #(WIDTH+1) reg4 (clk, reset, enable, Qd, Qd2); // Probably not needed - just assigns results - assign Q = Qd2[63:0]; - assign Rem5 = Rd2[64:1]; + assign Q = Qd2[WIDTH-1:0]; + assign Rem5 = Rd2[WIDTH:1]; - // Adjust remainder by m - shift_right #(64) p4 (Rem5, RemShift, rem0); + // Adjust remainder by m (no need to adjust by + shift_right #(WIDTH) p4 (Rem5, RemShift, rem0); // Adjust Q/Rem for Signed assign tcQ = (SignN ^ SignD) & S; assign tcR = SignN & S; - // Signed Divide + + // When Dividend (N) and/or Divisor (D) are negative (first bit is '1'): // - When N and D are negative: Remainder is negative (undergoes a two's complement). // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement). // - When D is negative: Quotient is negative (undergoes a two's complement). 
- adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT); - adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT); + adder #(WIDTH) cpa4 ((rem0 ^ {WIDTH{tcR}}), {{WIDTH-1{1'b0}}, tcR}, remT); + adder #(WIDTH) cpa5 ((Q ^ {WIDTH{tcQ}}), {{WIDTH-1{1'b0}}, tcQ}, QT); // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec) - exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf); - + exception_int #(WIDTH) exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf); + endmodule // int32div -module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, - enable, otfzero, shiftResult); +// Division by Recurrence (r=4) +module divide4 #(parameter WIDTH=64) + (Q, rem0, quotient, op1, op2, clk, reset, state0, + enable, otfzero, shiftResult); - input logic [63:0] op1, op2; - input logic clk, state0; - input logic reset; - input logic enable; - input logic otfzero; - input logic shiftResult; + input logic [WIDTH-1:0] op1, op2; + input logic clk, state0; + input logic reset; + input logic enable; + input logic otfzero; + input logic shiftResult; - output logic [64:0] rem0; - output logic [64:0] Q; - output logic [3:0] quotient; + output logic [WIDTH:0] rem0; + output logic [WIDTH:0] Q; + output logic [3:0] quotient; - logic [67:0] Sum, Carry; - logic [64:0] Qstar; - logic [64:0] QMstar; - logic [7:0] qtotal; - logic [67:0] SumN, CarryN, SumN2, CarryN2; - logic [67:0] divi1, divi2, divi1c, divi2c, dive1; - logic [67:0] mdivi_temp, mdivi; - logic zero; - logic [1:0] qsel; - logic [1:0] Qin, QMin; - logic CshiftQ, CshiftQM; - logic [67:0] rem1, rem2, rem3; - logic [67:0] SumR, CarryR; - logic [64:0] Qt; + logic [WIDTH+3:0] Sum, Carry; + logic [WIDTH:0] Qstar; + logic [WIDTH:0] QMstar; + logic [7:0] qtotal; + logic [WIDTH+3:0] SumN, CarryN, SumN2, CarryN2; + logic [WIDTH+3:0] divi1, divi2, divi1c, divi2c, dive1; + logic [WIDTH+3:0] mdivi_temp, mdivi; + logic zero; + logic [1:0] qsel; + logic [1:0] Qin, QMin; + logic CshiftQ, CshiftQM; + logic 
[WIDTH+3:0] rem1, rem2, rem3; + logic [WIDTH+3:0] SumR, CarryR; + logic [WIDTH:0] Qt; // Create one's complement values of Divisor (for q*D) assign divi1 = {3'h0, op2, 1'b0}; @@ -190,42 +191,42 @@ module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, assign divi1c = ~divi1; assign divi2c = ~divi2; // Shift x1 if not mod k - mux2 #(68) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1); + mux2 #(WIDTH+4) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1); // I I I . F F F F F ... (Robertson Criteria - \rho * qmax * D) - mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN); - mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN); + mux2 #(WIDTH+4) mx2 ({CarryN2[WIDTH+1:0], 2'h0}, {WIDTH+4{1'b0}}, state0, CarryN); + mux2 #(WIDTH+4) mx3 ({SumN2[WIDTH+1:0], 2'h0}, dive1, state0, SumN); // Simplify QST - adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal); + adder #(8) cpa1 (SumN[WIDTH+3:WIDTH-4], CarryN[WIDTH+3:WIDTH-4], qtotal); // q = {+2, +1, -1, -2} else q = 0 - qst4 pd1 (qtotal[7:1], divi1[63:61], quotient); + qst4 pd1 (qtotal[7:1], divi1[WIDTH-1:WIDTH-3], quotient); assign ulp = quotient[2]|quotient[3]; assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]); // Map to binary encoding assign qsel[1] = quotient[3]|quotient[2]; assign qsel[0] = quotient[3]|quotient[1]; - mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp); - mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi); - csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry); + mux4 #(WIDTH+4) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp); + mux2 #(WIDTH+4) mx5 (mdivi_temp, {WIDTH+4{1'b0}}, zero, mdivi); + csa #(WIDTH+4) csa1 (mdivi, SumN, {CarryN[WIDTH+3:1], ulp}, Sum, Carry); // regs : save CSA - flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2); - flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2); + flopenr #(WIDTH+4) reg1 (clk, reset, enable, Sum, SumN2); + flopenr #(WIDTH+4) reg2 (clk, reset, enable, Carry, CarryN2); // 
OTF ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM); - otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, - otfzero, enable, Qstar, QMstar); + otf #(WIDTH+1) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, + otfzero, enable, Qstar, QMstar); // Correction and generation of Remainder - adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1); + adder #(WIDTH+4) cpa2 (SumN2[WIDTH+3:0], CarryN2[WIDTH+3:0], rem1); // Add back +D as correction - csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR); - adder #(68) cpa3 (SumR, CarryR, rem2); + csa #(WIDTH+4) csa2 (CarryN2[WIDTH+3:0], SumN2[WIDTH+3:0], divi1, SumR, CarryR); + adder #(WIDTH+4) cpa3 (SumR, CarryR, rem2); // Choose remainder (Rem or Rem+D) - mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3); + mux2 #(WIDTH+4) mx6 (rem1, rem2, rem1[WIDTH+3], rem3); // Choose correct Q or QM - mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt); + mux2 #(WIDTH+1) mx7 (Qstar, QMstar, rem1[WIDTH+3], Qt); // Final results - assign rem0 = rem3[64:0]; + assign rem0 = rem3[WIDTH:0]; assign Q = Qt; endmodule // divide4x64 @@ -304,10 +305,9 @@ module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c, fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]); end endgenerate - //assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0}; // trmimmed excess bit dh 5/3/21 - assign carry = {carry_temp[WIDTH-1:1], 1'b0}; + assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0}; -endmodule // adder +endmodule // csa module eqcmp #(parameter WIDTH = 8) (input logic [WIDTH-1:0] a, b, @@ -490,26 +490,24 @@ module lz64 (ZP, ZV, B); endmodule // lz64 // FSM Control for Integer Divider -module fsm64 (en, state0, done, divdone, otfzero, divBusy, - start, error, NumIter, clk, reset); +module fsm64 #(parameter WIDTH=6) + (en, state0, done, otfzero, divBusy, start, error, NumIter, clk, reset); - input logic [5:0] NumIter; - input logic clk; - input logic reset; - input logic start; - input logic error; + input logic [WIDTH-1:0] NumIter; + input logic clk; + input 
logic reset; + input logic start; + input logic error; - output logic done; - output logic en; - output logic state0; - output logic divdone; - output logic otfzero; - output logic divBusy; + output logic done; + output logic en; + output logic state0; + output logic otfzero; + output logic divBusy; - logic LT, EQ; - logic Divide0; - logic [5:0] CURRENT_STATE; - logic [5:0] NEXT_STATE; + logic LT, EQ; + logic [5:0] CURRENT_STATE; + logic [5:0] NEXT_STATE; parameter [5:0] S0=6'd0, S1=6'd1, S2=6'd2, @@ -534,12 +532,8 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, CURRENT_STATE<=NEXT_STATE; end - // Going to cheat and hard code number of states - // needed into FSM instead of using a counter - // FIXME: could counter be better - // Cheated and made 8 - let synthesis do its magic - magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {2'h0, NumIter}); + magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {{8-WIDTH{1'b0}}, NumIter}); always @(CURRENT_STATE or start) begin @@ -552,7 +546,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; divBusy = 1'b0; state0 = 1'b0; - divdone = 1'b0; done = 1'b0; NEXT_STATE <= S0; end @@ -560,30 +553,21 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, begin otfzero = 1'b0; en = 1'b1; - divBusy = 1'b1; + divBusy = 1'b1; state0 = 1'b1; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; done = 1'b0; - divdone = 1'b0; NEXT_STATE <= S1; end end S1: begin - otfzero = 1'b0; - divBusy = 1'b1; + otfzero = 1'b0; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S2; end else @@ -591,8 +575,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S2; + NEXT_STATE <= S36; end end // case: S1 S2: @@ -604,10 +587,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone 
= 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S3; end // if (LT|EQ) else @@ -615,8 +594,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S3; + NEXT_STATE <= S36; end end // case: S2 S3: @@ -628,10 +606,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S4; end else @@ -639,8 +613,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S4; + NEXT_STATE <= S36; end end // case: S3 S4: @@ -652,10 +625,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S5; end else @@ -663,8 +632,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S5; + NEXT_STATE <= S36; end end // case: S4 S5: @@ -676,10 +644,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S6; end // if (LT|EQ) else @@ -687,8 +651,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S6; + NEXT_STATE <= S36; end end // case: S5 S6: @@ -700,10 +663,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S7; end // if (LT|EQ) else @@ -711,8 +670,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S7; + NEXT_STATE <= S36; end end // case: S6 S7: @@ -724,10 +682,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; 
state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S8; end // if (LT|EQ) else @@ -735,8 +689,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S8; + NEXT_STATE <= S36; end end // case: S7 S8: @@ -748,10 +701,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S9; end // if (LT|EQ) else @@ -759,8 +708,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S9; + NEXT_STATE <= S36; end end // case: S8 S9: @@ -772,10 +720,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S10; end // if (LT|EQ) else @@ -783,8 +727,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S10; + NEXT_STATE <= S36; end end // case: S9 S10: @@ -796,10 +739,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S11; end // if (LT|EQ) else @@ -807,8 +746,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S11; + NEXT_STATE <= S36; end end // case: S10 S11: @@ -820,10 +758,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S12; end // if (LT|EQ) else @@ -831,8 +765,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S12; + NEXT_STATE <= S36; end end // case: S11 S12: @@ 
-844,10 +777,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S13; end // if (LT|EQ) else @@ -855,8 +784,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S13; + NEXT_STATE <= S36; end end // case: S12 S13: @@ -868,10 +796,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S14; end // if (LT|EQ) else @@ -879,23 +803,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S14; + NEXT_STATE <= S36; end end // case: S13 S14: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S15; end // if (LT|EQ) else @@ -903,23 +822,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S15; + NEXT_STATE <= S36; end end // case: S14 S15: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S16; end // if (LT|EQ) else @@ -927,23 +841,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S16; + NEXT_STATE <= S36; end end // case: S15 S16: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S17; end // if (LT|EQ) else @@ -951,23 +860,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - 
divdone = 1'b0; - NEXT_STATE <= S17; + NEXT_STATE <= S36; end end // case: S16 S17: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S18; end // if (LT|EQ) else @@ -975,23 +879,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S18; + NEXT_STATE <= S36; end end // case: S17 S18: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S19; end // if (LT|EQ) else @@ -999,23 +898,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S19; + NEXT_STATE <= S36; end end // case: S18 S19: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S20; end // if (LT|EQ) else @@ -1023,23 +917,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S20; + NEXT_STATE <= S36; end end // case: S19 S20: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S21; end // if (LT|EQ) else @@ -1047,23 +936,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S21; + NEXT_STATE <= S36; end end // case: S20 S21: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S22; end // if (LT|EQ) else @@ -1071,23 +955,18 @@ module fsm64 (en, 
state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S22; + NEXT_STATE <= S36; end end // case: S21 S22: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S23; end // if (LT|EQ) else @@ -1095,23 +974,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S23; + NEXT_STATE <= S36; end end // case: S22 S23: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S24; end // if (LT|EQ) else @@ -1119,23 +993,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S24; + NEXT_STATE <= S36; end end // case: S23 S24: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S25; end // if (LT|EQ) else @@ -1143,23 +1012,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S25; + NEXT_STATE <= S36; end end // case: S24 S25: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S26; end // if (LT|EQ) else @@ -1167,23 +1031,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S26; + NEXT_STATE <= S36; end end // case: S25 S26: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; 
NEXT_STATE <= S27; end // if (LT|EQ) else @@ -1191,23 +1050,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S27; + NEXT_STATE <= S36; end end // case: S26 S27: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S28; end // if (LT|EQ) else @@ -1215,23 +1069,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S28; + NEXT_STATE <= S36; end end // case: S27 S28: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S29; end // if (LT|EQ) else @@ -1239,23 +1088,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S29; + NEXT_STATE <= S36; end end // case: S28 S29: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S30; end // if (LT|EQ) else @@ -1263,23 +1107,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S30; + NEXT_STATE <= S36; end end // case: S29 S30: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S31; end // if (LT|EQ) else @@ -1287,8 +1126,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S31; + NEXT_STATE <= S36; end end // case: S30 S31: @@ -1300,10 +1138,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, 
en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S32; end // if (LT|EQ) else @@ -1311,8 +1145,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S32; + NEXT_STATE <= S36; end end // case: S31 S32: @@ -1324,10 +1157,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S33; end // if (LT|EQ) else @@ -1335,8 +1164,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S33; + NEXT_STATE <= S36; end end // case: S32 S33: @@ -1348,10 +1176,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S34; end // if (LT|EQ) else @@ -1359,23 +1183,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S34; + NEXT_STATE <= S36; end end // case: S33 S34: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S35; end // if (LT|EQ) else @@ -1383,8 +1202,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S35; + NEXT_STATE <= S36; end end // case: S34 S35: @@ -1396,10 +1214,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S36; end // if (LT|EQ) else @@ -1407,7 +1221,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; NEXT_STATE <= S36; end end // case: S35 @@ 
-1419,12 +1232,10 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, done = 1'b1; if (EQ) begin - divdone = 1'b1; en = 1'b1; end else begin - divdone = 1'b0; en = 1'b0; end NEXT_STATE <= S0; @@ -1432,11 +1243,10 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, default: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b0; en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; NEXT_STATE <= S0; end endcase // case(CURRENT_STATE) @@ -1497,38 +1307,39 @@ module magcompare8 (LT, EQ, A, B); endmodule // magcompare8 -module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); +// RISC-V Exception Logic for Divide by 0 and Overflow (Signed Integer Divide) +module exception_int #(parameter WIDTH=8) + (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); - input logic [63:0] Q; - input logic [63:0] rem; - input logic [63:0] op1; - input logic S; - input logic div0; - input logic Max_N; - input logic D_NegOne; + input logic [WIDTH-1:0] Q; + input logic [WIDTH-1:0] rem; + input logic [WIDTH-1:0] op1; + input logic S; + input logic div0; + input logic Max_N; + input logic D_NegOne; - output logic [63:0] Qf; - output logic [63:0] remf; + output logic [WIDTH-1:0] Qf; + output logic [WIDTH-1:0] remf; - // Needs to be optimized always_comb case ({div0, S, Max_N, D_NegOne}) 4'b0000 : Qf = Q; 4'b0001 : Qf = Q; - 4'b0010 : Qf = Q; - 4'b0011 : Qf = Q; + 4'b0010 : Qf = Q; + 4'b0011 : Qf = Q; 4'b0100 : Qf = Q; - 4'b0101 : Qf = Q; + 4'b0101 : Qf = Q; 4'b0110 : Qf = Q; - 4'b0111 : Qf = {1'b1, 31'h0}; - 4'b1000 : Qf = {64{1'b1}}; - 4'b1001 : Qf = {64{1'b1}}; - 4'b1010 : Qf = {64{1'b1}}; - 4'b1011 : Qf = {64{1'b1}}; - 4'b1100 : Qf = {64{1'b1}}; - 4'b1101 : Qf = {64{1'b1}}; - 4'b1110 : Qf = {64{1'b1}}; - 4'b1111 : Qf = {64{1'b1}}; + 4'b0111 : Qf = {1'b1, {WIDTH-1{1'h0}}}; + 4'b1000 : Qf = {WIDTH{1'b1}}; + 4'b1001 : Qf = {WIDTH{1'b1}}; + 4'b1010 : Qf = {WIDTH{1'b1}}; + 4'b1011 : Qf = {WIDTH{1'b1}}; + 4'b1100 : Qf = {WIDTH{1'b1}}; + 4'b1101 : Qf = 
{WIDTH{1'b1}}; + 4'b1110 : Qf = {WIDTH{1'b1}}; + 4'b1111 : Qf = {WIDTH{1'b1}}; default: Qf = Q; endcase @@ -1536,18 +1347,18 @@ module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); case ({div0, S, Max_N, D_NegOne}) 4'b0000 : remf = rem; 4'b0001 : remf = rem; - 4'b0010 : remf = rem; + 4'b0010 : remf = rem; 4'b0011 : remf = rem; 4'b0100 : remf = rem; 4'b0101 : remf = rem; 4'b0110 : remf = rem; - 4'b0111 : remf = 64'h0; + 4'b0111 : remf = {WIDTH{1'h0}}; 4'b1000 : remf = op1; 4'b1001 : remf = op1; 4'b1010 : remf = op1; 4'b1011 : remf = op1; 4'b1100 : remf = op1; - 4'b1101 : remf = op1; + 4'b1101 : remf = op1; 4'b1110 : remf = op1; 4'b1111 : remf = op1; default: remf = rem; @@ -1557,4 +1368,3 @@ endmodule // exception_int /* verilator lint_on COMBDLY */ /* verilator lint_on IMPLICIT */ - diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv index 17c4aac5..f4096fd1 100644 --- a/wally-pipelined/src/muldiv/muldiv.sv +++ b/wally-pipelined/src/muldiv/muldiv.sv @@ -78,7 +78,7 @@ module muldiv ( .en(startDivideE), .clear(DivDoneE), .reset(reset), .clk(~gclk)); assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]); - div div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide); + intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide); // Added for debugging of start signal for divide assign startDivideE = MulDivE&DivStartE&~DivBusyE; @@ -93,7 +93,6 @@ module muldiv ( // Select result always_comb - // case (DivDoneE ? Funct3E_Q : Funct3E) case (Funct3E) 3'b000: PrelimResultE = ProdE[`XLEN-1:0]; 3'b001: PrelimResultE = ProdE[`XLEN*2-1:`XLEN]; From 46a232b862249262e91fd0241c48f7b662bac599 Mon Sep 17 00:00:00 2001 From: "James E. 
Stine" Date: Mon, 31 May 2021 09:16:30 -0400 Subject: [PATCH 07/19] Cosmetic changes on integer divider --- wally-pipelined/src/muldiv/div.sv | 7 ++++--- wally-pipelined/src/muldiv/muldiv.sv | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv index 107b002f..8b4e0463 100755 --- a/wally-pipelined/src/muldiv/div.sv +++ b/wally-pipelined/src/muldiv/div.sv @@ -55,7 +55,7 @@ module intdiv #(parameter WIDTH=64) logic [3:0] quotient; logic otfzero; logic shiftResult; - logic enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp; + logic enablev, state0v, donev, oftzerov, divBusyv, ulp; logic [WIDTH-1:0] twoD; logic [WIDTH-1:0] twoN; @@ -231,6 +231,7 @@ module divide4 #(parameter WIDTH=64) endmodule // divide4x64 +// Load/Control for OTFC module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM); input logic [3:0] quot; @@ -251,8 +252,7 @@ module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM); endmodule -// On-the-fly Conversion per Ercegovac/Lang - +// On-the-fly Conversion (OTFC) module otf #(parameter WIDTH=8) (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q); @@ -317,6 +317,7 @@ module eqcmp #(parameter WIDTH = 8) endmodule // eqcmp +// QST for r=4 module qst4 (input logic [6:0] s, input logic [2:0] d, output logic [3:0] q); diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv index f4096fd1..ccabe341 100644 --- a/wally-pipelined/src/muldiv/muldiv.sv +++ b/wally-pipelined/src/muldiv/muldiv.sv @@ -47,7 +47,6 @@ module muldiv ( logic [`XLEN-1:0] MulDivResultE, MulDivResultM; logic [`XLEN-1:0] PrelimResultE; logic [`XLEN-1:0] QuotE, RemE; - //logic [`XLEN-1:0] Q, R; logic [`XLEN*2-1:0] ProdE; logic enable_q; From f6c88666cfc8dbeebfd34db85b5282636c361690 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Mon, 31 May 2021 16:11:12 -0500 Subject: [PATCH 08/19] may have fixed the global branch history predictor. 
The solution required a completed rewrite and understanding of how the GHR needs to be speculatively updated and repaired. --- testsBP/simple/header.h | 4 +- testsBP/simple/main.c | 4 +- wally-pipelined/src/ifu/bpred.sv | 11 ++- .../src/ifu/globalHistoryPredictor.sv | 98 +++++++++++++++---- wally-pipelined/src/ifu/gshare.sv | 41 +++++++- 5 files changed, 128 insertions(+), 30 deletions(-) diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h index 6def656f..f3a62da3 100644 --- a/testsBP/simple/header.h +++ b/testsBP/simple/header.h @@ -5,5 +5,7 @@ int fail(); int simple_csrbr_test(); int lbu_test(); int icache_spill_test(); -void global_hist_test(); +void global_hist_1_space_test(); +void global_hist_2_space_test(); +void global_hist_3_space_test(); #endif diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c index 036a351d..7bf6b475 100644 --- a/testsBP/simple/main.c +++ b/testsBP/simple/main.c @@ -2,7 +2,9 @@ int main(){ //int res = icache_spill_test(); - global_hist_test(); + global_hist_3_space_test(); + global_hist_2_space_test(); + global_hist_1_space_test(); int res = 1; if (res < 0) { fail(); diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv index c5b4dde4..9beaa959 100644 --- a/wally-pipelined/src/ifu/bpred.sv +++ b/wally-pipelined/src/ifu/bpred.sv @@ -90,12 +90,13 @@ module bpred .reset(reset), .*, // Stalls and flushes .LookUpPC(PCNextF), - .Prediction(BPPredF), + .BPPredF(BPPredF), // update - .UpdatePC(PCE), - .UpdateEN(InstrClassE[0] & ~StallE), - .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF), + .BPPredD(BPPredD), + .InstrClassE(InstrClassE), + .BPInstrClassE(BPInstrClassE), .BPPredDirWrongE(BPPredDirWrongE), + .UpdatePC(PCE), .PCSrcE(PCSrcE), .UpdatePrediction(UpdateBPPredE)); end else if (`BPTYPE == "BPGSHARE") begin:Predictor @@ -108,6 +109,8 @@ module bpred // update .UpdatePC(PCE), .UpdateEN(InstrClassE[0] & ~StallE), + .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF), + 
.BPPredDirWrongE(BPPredDirWrongE), .PCSrcE(PCSrcE), .UpdatePrediction(UpdateBPPredE)); end diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv index fadbf004..b2ac1991 100644 --- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv +++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv @@ -34,49 +34,108 @@ module globalHistoryPredictor input logic reset, input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, input logic [`XLEN-1:0] LookUpPC, - output logic [1:0] Prediction, + output logic [1:0] BPPredF, // update + input logic [1:0] BPPredD, + input logic [4:0] InstrClassE, + input logic [4:0] BPInstrClassE, + input logic [4:0] BPInstrClassD, + input logic [4:0] BPInstrClassF, + input logic BPPredDirWrongE, + input logic [`XLEN-1:0] UpdatePC, - input logic UpdateEN, PCSrcE, - input logic SpeculativeUpdateEn, BPPredDirWrongE, + input logic PCSrcE, input logic [1:0] UpdatePrediction ); - logic [k-1:0] GHRF, GHRFNext, GHRD, GHRE, GHRLookup; + logic [k+1:0] GHR, GHRNext; + logic [k-1:0] PHTUpdateAdr, PHTUpdateAdr0, PHTUpdateAdr1; + logic PHTUpdateEN; + logic BPClassWrongNonCFI; + logic BPClassWrongCFI; + logic BPClassRightNonCFI; + + +/* -----\/----- EXCLUDED -----\/----- + logic [k-1:0] GHRD, GHRE, GHRLookup; logic FlushedD, FlushedE; + -----/\----- EXCLUDED -----/\----- */ + + + logic [6:0] GHRMuxSel; + logic GHRUpdateEN; + + assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0]; + assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0]; + assign BPClassWrongNonCFI = BPInstrClassE[0] & ~InstrClassE[0]; + assign BPClassRightBPWrong = BPInstrClassE[0] & InstrClassE[0] & BPPredDirWrongE; + assign BPClassRightBPRight = BPInstrClassE[0] & InstrClassE[0] & ~BPPredDirWrongE; + + // GHR update selection, 1 hot encoded. + assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight); - // if the prediction is wrong we need to restore the ghr. 
- assign GHRFNext = BPPredDirWrongE ? {PCSrcE, GHRE[k-1:1]} : - {Prediction[1], GHRF[k-1:1]}; + assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0]; + assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]); - flopenr #(k) GlobalHistoryRegister(.clk(clk), - .reset(reset), - .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)), - .d(GHRFNext), - .q(GHRF)); + + assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0]; + + + + assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0]; + assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0]; + assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight)); + assign GHRUpdateEN = (| GHRMuxSel[5:1] & ~StallE) | GHRMuxSel[6] & ~StallF; + + // hoping this created a AND-OR mux. + always_comb begin + case (GHRMuxSel) + 7'b000_0001: GHRNext = GHR[k-1+2:0]; // no change + 7'b000_0010: GHRNext = {GHR[k-2+2:0], PCSrcE}; // branch update + 7'b000_0100: GHRNext = {1'b0, GHR[k+1:1]}; // repair 1 + 7'b000_1000: GHRNext = {GHR[k-1+2:1], PCSrcE}; // branch update with mis prediction correction + 7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2 + 7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1 + 7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update + //7'b100_0000: GHRNext = {k+1{1'bx}}; // speculative update + default: GHRNext = GHR[k-1+2:0]; + endcase + end + + flopenr #(k+2) GlobalHistoryRegister(.clk(clk), + .reset(reset), + .en((GHRUpdateEN)), + .d(GHRNext), + .q(GHR)); // if actively updating the GHR at the time of prediction we want to us - // GHRFNext as the lookup rather than GHRF. + // GHRNext as the lookup rather than GHR. - assign GHRLookup = UpdateEN ? GHRFNext : GHRF; + //assign GHRLookup = GHRUpdateEN ? GHRNext : GHR; + assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0]; + assign PHTUpdateAdr1 = InstrClassE[0] ? 
GHR[k+1:2] : GHR[k:1]; + assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0; + assign PHTUpdateEN = InstrClassE[0] & ~StallE; + // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT SRAM2P1R1W #(k, 2) PHT(.clk(clk), .reset(reset), - .RA1(GHRF), - .RD1(Prediction), + .RA1(GHR[k-1:0]), + .RD1(BPPredF), .REN1(~StallF), - .WA1(GHRE), + .WA1(PHTUpdateAdr), .WD1(UpdatePrediction), - .WEN1(UpdateEN), + .WEN1(PHTUpdateEN), .BitWEN1(2'b11)); +/* -----\/----- EXCLUDED -----\/----- flopenr #(k) GlobalHistoryRegisterD(.clk(clk), .reset(reset), .en(~StallD & ~FlushedE), - .d(GHRF), + .d(GHR), .q(GHRD)); flopenr #(k) GlobalHistoryRegisterE(.clk(clk), @@ -97,6 +156,7 @@ module globalHistoryPredictor .en(~StallE), .d(FlushE | FlushedD), .q(FlushedE)); + -----/\----- EXCLUDED -----/\----- */ endmodule diff --git a/wally-pipelined/src/ifu/gshare.sv b/wally-pipelined/src/ifu/gshare.sv index 4d31e519..3cc73be8 100644 --- a/wally-pipelined/src/ifu/gshare.sv +++ b/wally-pipelined/src/ifu/gshare.sv @@ -38,28 +38,32 @@ module gsharePredictor // update input logic [`XLEN-1:0] UpdatePC, input logic UpdateEN, PCSrcE, + input logic SpeculativeUpdateEn, BPPredDirWrongE, input logic [1:0] UpdatePrediction ); - logic [k-1:0] GHRF, GHRFNext; + logic [k-1:0] GHRF, GHRFNext, GHRD, GHRE; //logic [k-1:0] LookUpPCIndexD, LookUpPCIndexE; logic [k-1:0] LookUpPCIndex, UpdatePCIndex; logic [1:0] PredictionMemory; logic DoForwarding, DoForwardingF; logic [1:0] UpdatePredictionF; + logic FlushedD, FlushedE; - assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; + // if the prediction is wrong we need to restore the ghr. + assign GHRFNext = BPPredDirWrongE ? 
{PCSrcE, GHRE[k-1:1]} : + {Prediction[1], GHRF[k-1:1]}; flopenr #(k) GlobalHistoryRegister(.clk(clk), .reset(reset), - .en(UpdateEN), + .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)), .d(GHRFNext), .q(GHRF)); // for gshare xor the PC with the GHR - assign UpdatePCIndex = GHRFNext ^ UpdatePC[k:1]; + assign UpdatePCIndex = GHRE ^ UpdatePC[k:1]; assign LookUpPCIndex = GHRF ^ LookUpPC[k:1]; // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT // GHR referes to the address that the past k branches points to in the prediction stage @@ -67,7 +71,7 @@ module gsharePredictor SRAM2P1R1W #(k, 2) PHT(.clk(clk), .reset(reset), .RA1(LookUpPCIndex), - .RD1(PredictionMemory), + .RD1(Prediction), .REN1(~StallF), .WA1(UpdatePCIndex), .WD1(UpdatePrediction), @@ -75,6 +79,32 @@ module gsharePredictor .BitWEN1(2'b11)); + flopenr #(k) GlobalHistoryRegisterD(.clk(clk), + .reset(reset), + .en(~StallD & ~FlushedE), + .d(GHRF), + .q(GHRD)); + + flopenr #(k) GlobalHistoryRegisterE(.clk(clk), + .reset(reset), + .en(~StallE & ~ FlushedE), + .d(GHRD), + .q(GHRE)); + + + flopenr #(1) flushedDReg(.clk(clk), + .reset(reset), + .en(~StallD), + .d(FlushD), + .q(FlushedD)); + + flopenr #(1) flushedEReg(.clk(clk), + .reset(reset), + .en(~StallE), + .d(FlushE | FlushedD), + .q(FlushedE)); + +/* -----\/----- EXCLUDED -----\/----- // need to forward when updating to the same address as reading. // first we compare to see if the update and lookup addreses are the same assign DoForwarding = LookUpPCIndex == UpdatePCIndex; @@ -92,6 +122,7 @@ module gsharePredictor .q(UpdatePredictionF)); assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory; + -----/\----- EXCLUDED -----/\----- */ //pipeline for GHR /* -----\/----- EXCLUDED -----\/----- From ddbdd0d5a27d485537994516a0671225c3cb7219 Mon Sep 17 00:00:00 2001 From: "James E. 
Stine" Date: Mon, 31 May 2021 23:27:42 -0400 Subject: [PATCH 09/19] Modify muldiv.sv to handle W instructions for 64-bits --- wally-pipelined/src/muldiv/div.sv | 1 - wally-pipelined/src/muldiv/muldiv.sv | 17 ++++++++++++++--- wally-pipelined/testbench/testbench-imperas.sv | 10 +++++----- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv index 8b4e0463..10af5eee 100755 --- a/wally-pipelined/src/muldiv/div.sv +++ b/wally-pipelined/src/muldiv/div.sv @@ -87,7 +87,6 @@ module intdiv #(parameter WIDTH=64) // is 0 and thus a divide by 0 exception. This div0 // exception is given to FSM to tell the operation to // quit gracefully. - lzd_hier #(WIDTH) p1 (.ZP(P), .ZV(V), .B(twoD)); shift_left #(WIDTH) p2 (twoD, P, op2); assign op1 = twoN; diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv index ccabe341..0c26a5df 100644 --- a/wally-pipelined/src/muldiv/muldiv.sv +++ b/wally-pipelined/src/muldiv/muldiv.sv @@ -53,6 +53,7 @@ module muldiv ( logic [2:0] Funct3E_Q; logic div0error; logic [`XLEN-1:0] N, D; + logic [`XLEN-1:0] Num0, Den0; logic gclk; logic DivStartE; @@ -69,13 +70,23 @@ module muldiv ( end assign gclk = enable_q & clk; + // Handle sign extension for W-type instructions + if (`XLEN == 64) begin // RV64 has W-type instructions + assign Num0 = W64E ? {{32{SrcAE[31]&signedDivide}}, SrcAE[31:0]} : SrcAE; + assign Den0 = W64E ? 
{{32{SrcBE[31]&signedDivide}}, SrcBE[31:0]} : SrcBE; + end else begin // RV32 has no W-type instructions + assign Num0 = SrcAE; + assign Den0 = SrcAE; + end + // capture the Numerator/Denominator - flopenrc #(`XLEN) reg_num (.d(SrcAE), .q(N), + flopenrc #(`XLEN) reg_num (.d(Num0), .q(N), .en(startDivideE), .clear(DivDoneE), .reset(reset), .clk(~gclk)); - flopenrc #(`XLEN) reg_den (.d(SrcBE), .q(D), + flopenrc #(`XLEN) reg_den (.d(Den0), .q(D), .en(startDivideE), .clear(DivDoneE), - .reset(reset), .clk(~gclk)); + .reset(reset), .clk(~gclk)); + assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]); intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide); diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index ea693900..6d8f1049 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -166,12 +166,12 @@ string tests32f[] = '{ "rv64m/I-MULW-01", "3000", "rv64m/I-DIV-01", "3000", "rv64m/I-DIVU-01", "3000", - //"rv64m/I-DIVUW-01", "3000", - //"rv64m/I-DIVW-01", "3000", + "rv64m/I-DIVUW-01", "3000", + "rv64m/I-DIVW-01", "3000", "rv64m/I-REM-01", "3000", - "rv64m/I-REMU-01", "3000" - //"rv64m/I-REMUW-01", "3000", - //"rv64m/I-REMW-01", "3000" + "rv64m/I-REMU-01", "3000", + "rv64m/I-REMUW-01", "3000", + "rv64m/I-REMW-01", "3000" }; string tests64ic[] = '{ From 857f59ab5c51e146d1cdf121443297f7ac079246 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 10:57:43 -0500 Subject: [PATCH 10/19] Now have global history working correctly. 
--- testsBP/crt0/Makefile | 8 +- testsBP/simple/header.h | 1 + testsBP/simple/main.c | 3 +- wally-pipelined/config/rv64BP/wally-config.vh | 6 +- wally-pipelined/src/ifu/bpred.sv | 29 ++-- .../src/ifu/globalHistoryPredictor.sv | 62 ++----- wally-pipelined/src/ifu/gshare.sv | 159 ------------------ .../testbench/testbench-imperas.sv | 5 +- 8 files changed, 38 insertions(+), 235 deletions(-) delete mode 100644 wally-pipelined/src/ifu/gshare.sv diff --git a/testsBP/crt0/Makefile b/testsBP/crt0/Makefile index b42e86cb..2af43a40 100644 --- a/testsBP/crt0/Makefile +++ b/testsBP/crt0/Makefile @@ -4,12 +4,12 @@ ROOT := .. LIBRARY_DIRS := LIBRARY_FILES := -MARCH :=-march=rv64ic -MABI :=-mabi=lp64 +MARCH :=-march=rv64imfdc +MABI :=-mabi=lp64d LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles -AFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -W -CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -mcmodel=medany -O2 +AFLAGS =$(MARCH) $(MABI) -W +CFLAGS =$(MARCH) $(MABI) -mcmodel=medany -O2 AS=riscv64-unknown-elf-as CC=riscv64-unknown-elf-gcc AR=riscv64-unknown-elf-ar diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h index f3a62da3..aab8973f 100644 --- a/testsBP/simple/header.h +++ b/testsBP/simple/header.h @@ -5,6 +5,7 @@ int fail(); int simple_csrbr_test(); int lbu_test(); int icache_spill_test(); +void global_hist_0_space_test(); void global_hist_1_space_test(); void global_hist_2_space_test(); void global_hist_3_space_test(); diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c index 7bf6b475..564b474e 100644 --- a/testsBP/simple/main.c +++ b/testsBP/simple/main.c @@ -4,7 +4,8 @@ int main(){ //int res = icache_spill_test(); global_hist_3_space_test(); global_hist_2_space_test(); - global_hist_1_space_test(); + global_hist_1_space_test(); + global_hist_0_space_test(); int res = 1; if (res < 0) { fail(); diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh index fd482bfd..a9dbb1bd 100644 --- 
a/wally-pipelined/config/rv64BP/wally-config.vh +++ b/wally-pipelined/config/rv64BP/wally-config.vh @@ -32,7 +32,7 @@ `define XLEN 64 //`define MISA (32'h00000105) -`define MISA (32'h00000104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12 | 1 << 0) +`define MISA (32'h00000104 | 1 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0) `define A_SUPPORTED ((`MISA >> 0) % 2 == 1) `define C_SUPPORTED ((`MISA >> 2) % 2 == 1) `define D_SUPPORTED ((`MISA >> 3) % 2 == 1) @@ -107,8 +107,8 @@ /* verilator lint_off ASSIGNDLY */ /* verilator lint_off PINCONNECTEMPTY */ -`define TWO_BIT_PRELOAD "../config/rv64icfd/twoBitPredictor.txt" -`define BTB_PRELOAD "../config/rv64icfd/BTBPredictor.txt" +`define TWO_BIT_PRELOAD "../config/rv64BP/twoBitPredictor.txt" +`define BTB_PRELOAD "../config/rv64BP/BTBPredictor.txt" `define BPRED_ENABLED 1 //`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE `define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE" or BPLOCALPAg or BPGSHARE diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv index 9beaa959..92471c57 100644 --- a/wally-pipelined/src/ifu/bpred.sv +++ b/wally-pipelined/src/ifu/bpred.sv @@ -89,30 +89,29 @@ module bpred globalHistoryPredictor DirPredictor(.clk(clk), .reset(reset), .*, // Stalls and flushes - .LookUpPC(PCNextF), + .PCNextF(PCNextF), .BPPredF(BPPredF), // update - .BPPredD(BPPredD), .InstrClassE(InstrClassE), .BPInstrClassE(BPInstrClassE), .BPPredDirWrongE(BPPredDirWrongE), - .UpdatePC(PCE), + .PCE(PCE), .PCSrcE(PCSrcE), - .UpdatePrediction(UpdateBPPredE)); + .UpdateBPPredE(UpdateBPPredE)); end else if (`BPTYPE == "BPGSHARE") begin:Predictor gsharePredictor DirPredictor(.clk(clk), - .reset(reset), - .*, // Stalls and flushes - .LookUpPC(PCNextF), - .Prediction(BPPredF), - // update - .UpdatePC(PCE), - .UpdateEN(InstrClassE[0] & ~StallE), - .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF), - .BPPredDirWrongE(BPPredDirWrongE), - .PCSrcE(PCSrcE), - .UpdatePrediction(UpdateBPPredE)); + .reset(reset), + .*, 
// Stalls and flushes + .PCNextF(PCNextF), + .BPPredF(BPPredF), + // update + .InstrClassE(InstrClassE), + .BPInstrClassE(BPInstrClassE), + .BPPredDirWrongE(BPPredDirWrongE), + .PCE(PCE), + .PCSrcE(PCSrcE), + .UpdateBPPredE(UpdateBPPredE)); end else if (`BPTYPE == "BPLOCALPAg") begin:Predictor diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv index b2ac1991..516de633 100644 --- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv +++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv @@ -33,19 +33,18 @@ module globalHistoryPredictor (input logic clk, input logic reset, input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, - input logic [`XLEN-1:0] LookUpPC, + input logic [`XLEN-1:0] PCNextF, output logic [1:0] BPPredF, // update - input logic [1:0] BPPredD, input logic [4:0] InstrClassE, input logic [4:0] BPInstrClassE, input logic [4:0] BPInstrClassD, input logic [4:0] BPInstrClassF, input logic BPPredDirWrongE, - input logic [`XLEN-1:0] UpdatePC, + input logic [`XLEN-1:0] PCE, input logic PCSrcE, - input logic [1:0] UpdatePrediction + input logic [1:0] UpdateBPPredE ); logic [k+1:0] GHR, GHRNext; @@ -54,17 +53,10 @@ module globalHistoryPredictor logic BPClassWrongNonCFI; logic BPClassWrongCFI; logic BPClassRightNonCFI; - - -/* -----\/----- EXCLUDED -----\/----- - logic [k-1:0] GHRD, GHRE, GHRLookup; - - logic FlushedD, FlushedE; - -----/\----- EXCLUDED -----/\----- */ - logic [6:0] GHRMuxSel; logic GHRUpdateEN; + logic [k-1:0] GHRLookup; assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0]; assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0]; @@ -75,15 +67,9 @@ module globalHistoryPredictor // GHR update selection, 1 hot encoded. 
assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight); - assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0]; - assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]); - - assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0]; - - - + assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]); assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0]; assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0]; assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight)); @@ -99,7 +85,6 @@ module globalHistoryPredictor 7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2 7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1 7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update - //7'b100_0000: GHRNext = {k+1{1'bx}}; // speculative update default: GHRNext = GHR[k-1+2:0]; endcase end @@ -113,50 +98,23 @@ module globalHistoryPredictor // if actively updating the GHR at the time of prediction we want to us // GHRNext as the lookup rather than GHR. - //assign GHRLookup = GHRUpdateEN ? GHRNext : GHR; - assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0]; assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1]; assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0; assign PHTUpdateEN = InstrClassE[0] & ~StallE; + + assign GHRLookup = |GHRMuxSel[6:1] ? 
GHRNext[k-1:0] : GHR[k-1:0]; // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT SRAM2P1R1W #(k, 2) PHT(.clk(clk), .reset(reset), - .RA1(GHR[k-1:0]), + //.RA1(GHR[k-1:0]), + .RA1(GHRLookup), .RD1(BPPredF), .REN1(~StallF), .WA1(PHTUpdateAdr), - .WD1(UpdatePrediction), + .WD1(UpdateBPPredE), .WEN1(PHTUpdateEN), .BitWEN1(2'b11)); -/* -----\/----- EXCLUDED -----\/----- - flopenr #(k) GlobalHistoryRegisterD(.clk(clk), - .reset(reset), - .en(~StallD & ~FlushedE), - .d(GHR), - .q(GHRD)); - - flopenr #(k) GlobalHistoryRegisterE(.clk(clk), - .reset(reset), - .en(~StallE & ~ FlushedE), - .d(GHRD), - .q(GHRE)); - - - flopenr #(1) flushedDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .d(FlushD), - .q(FlushedD)); - - flopenr #(1) flushedEReg(.clk(clk), - .reset(reset), - .en(~StallE), - .d(FlushE | FlushedD), - .q(FlushedE)); - -----/\----- EXCLUDED -----/\----- */ - - endmodule diff --git a/wally-pipelined/src/ifu/gshare.sv b/wally-pipelined/src/ifu/gshare.sv deleted file mode 100644 index 3cc73be8..00000000 --- a/wally-pipelined/src/ifu/gshare.sv +++ /dev/null @@ -1,159 +0,0 @@ -/////////////////////////////////////////// -// gshare.sv -// -// Written: Shreya Sanghai -// Email: ssanghai@hmc.edu -// Created: March 16, 2021 -// Modified: -// -// Purpose: Gshare predictor with parameterized global history register -// -// A component of the Wally configurable RISC-V project. 
-// -// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, -// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software -// is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT -// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -/////////////////////////////////////////// - -`include "wally-config.vh" - -module gsharePredictor - #(parameter int k = 10 - ) - (input logic clk, - input logic reset, - input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, - input logic [`XLEN-1:0] LookUpPC, - output logic [1:0] Prediction, - // update - input logic [`XLEN-1:0] UpdatePC, - input logic UpdateEN, PCSrcE, - input logic SpeculativeUpdateEn, BPPredDirWrongE, - input logic [1:0] UpdatePrediction - - ); - - logic [k-1:0] GHRF, GHRFNext, GHRD, GHRE; - //logic [k-1:0] LookUpPCIndexD, LookUpPCIndexE; - logic [k-1:0] LookUpPCIndex, UpdatePCIndex; - logic [1:0] PredictionMemory; - logic DoForwarding, DoForwardingF; - logic [1:0] UpdatePredictionF; - logic FlushedD, FlushedE; - - // if the prediction is wrong we need to restore the ghr. - assign GHRFNext = BPPredDirWrongE ? 
{PCSrcE, GHRE[k-1:1]} : - {Prediction[1], GHRF[k-1:1]}; - - flopenr #(k) GlobalHistoryRegister(.clk(clk), - .reset(reset), - .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)), - .d(GHRFNext), - .q(GHRF)); - - - // for gshare xor the PC with the GHR - assign UpdatePCIndex = GHRE ^ UpdatePC[k:1]; - assign LookUpPCIndex = GHRF ^ LookUpPC[k:1]; - // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT - // GHR referes to the address that the past k branches points to in the prediction stage - // GHRE refers to the address that the past k branches points to in the exectution stage - SRAM2P1R1W #(k, 2) PHT(.clk(clk), - .reset(reset), - .RA1(LookUpPCIndex), - .RD1(Prediction), - .REN1(~StallF), - .WA1(UpdatePCIndex), - .WD1(UpdatePrediction), - .WEN1(UpdateEN), - .BitWEN1(2'b11)); - - - flopenr #(k) GlobalHistoryRegisterD(.clk(clk), - .reset(reset), - .en(~StallD & ~FlushedE), - .d(GHRF), - .q(GHRD)); - - flopenr #(k) GlobalHistoryRegisterE(.clk(clk), - .reset(reset), - .en(~StallE & ~ FlushedE), - .d(GHRD), - .q(GHRE)); - - - flopenr #(1) flushedDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .d(FlushD), - .q(FlushedD)); - - flopenr #(1) flushedEReg(.clk(clk), - .reset(reset), - .en(~StallE), - .d(FlushE | FlushedD), - .q(FlushedE)); - -/* -----\/----- EXCLUDED -----\/----- - // need to forward when updating to the same address as reading. - // first we compare to see if the update and lookup addreses are the same - assign DoForwarding = LookUpPCIndex == UpdatePCIndex; - - // register the update value and the forwarding signal into the Fetch stage - // TODO: add stall logic *** - flopr #(1) DoForwardingReg(.clk(clk), - .reset(reset), - .d(DoForwarding), - .q(DoForwardingF)); - - flopr #(2) UpdatePredictionReg(.clk(clk), - .reset(reset), - .d(UpdatePrediction), - .q(UpdatePredictionF)); - - assign Prediction = DoForwardingF ? 
UpdatePredictionF : PredictionMemory; - -----/\----- EXCLUDED -----/\----- */ - - //pipeline for GHR -/* -----\/----- EXCLUDED -----\/----- - flopenrc #(k) LookUpDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .clear(FlushD), - .d(LookUpPCIndex), - .q(LookUpPCIndexD)); - - flopenrc #(k) LookUpEReg(.clk(clk), - .reset(reset), - .en(~StallE), - .clear(FlushE), - .d(LookUpPCIndexD), - .q(LookUpPCIndexE)); - -----/\----- EXCLUDED -----/\----- */ - -/* flopenrc #(k) GHRRegD(.clk(clk), - .reset(reset), - .en(~StallD), - .clear(FlushD), - .d(GHRF), - .q(GHRD)); - - flopenrc #(k) GHRRegE(.clk(clk), - .reset(reset), - .en(~StallE), - .clear(FlushE), - .d(GHRD), - .q(GHRE)); - -*/ -endmodule diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index ddee23a1..bb8ffbd4 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -438,8 +438,11 @@ string tests32f[] = '{ string testsBP64[] = '{ "rv64BP/simple", "10000", + "rv64BP/mmm", "1000000", + "rv64BP/linpack_bench", "1000000", + "rv64BP/sieve", "1000000", "rv64BP/qsort", "1000000", - "rv64BP/sieve", "1000000" + "rv64BP/dhrystone", "1000000" }; string tests64p[] = '{ From ab509614bb36a1db60b8017f0df1521bf9688858 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 12:14:58 -0500 Subject: [PATCH 11/19] Changed to bp config to use gshare. 
--- wally-pipelined/config/rv64BP/wally-config.vh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh index a9dbb1bd..f85e0c22 100644 --- a/wally-pipelined/config/rv64BP/wally-config.vh +++ b/wally-pipelined/config/rv64BP/wally-config.vh @@ -111,5 +111,5 @@ `define BTB_PRELOAD "../config/rv64BP/BTBPredictor.txt" `define BPRED_ENABLED 1 //`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE -`define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE" or BPLOCALPAg or BPGSHARE +`define BPTYPE "BPGSHARE" // BPTWOBIT or "BPGLOBAL" or BPLOCALPAg or BPGSHARE `define TESTSBP 1 From 997c13a5217f0d32a40dc77dcc6a2653368bb397 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 12:41:48 -0500 Subject: [PATCH 12/19] Forgot to include the new gshare predictor file. --- wally-pipelined/src/ifu/gsharePredictor.sv | 120 +++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 wally-pipelined/src/ifu/gsharePredictor.sv diff --git a/wally-pipelined/src/ifu/gsharePredictor.sv b/wally-pipelined/src/ifu/gsharePredictor.sv new file mode 100644 index 00000000..b4a60827 --- /dev/null +++ b/wally-pipelined/src/ifu/gsharePredictor.sv @@ -0,0 +1,120 @@ +/////////////////////////////////////////// +// gsharePredictor.sv +// +// Written: Shreya Sanghai +// Email: ssanghai@hmc.edu +// Created: March 16, 2021 +// Modified: +// +// Purpose: Gshare predictor with parameterized global history register +// +// A component of the Wally configurable RISC-V project.
+// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module gsharePredictor // gshare direction predictor: PHT addressed by global history XOR PC bits
+  #(parameter int k = 10 // number of history/PC bits used to address the PHT
+    )
+  (input logic             clk,
+   input logic             reset,
+   input logic             StallF, StallD, StallE, FlushF, FlushD, FlushE,
+   input logic [`XLEN-1:0] PCNextF, // lookup PC; bits [k:1] are hashed into the PHT read address
+   output logic [1:0]      BPPredF, // PHT read data; bit [1] is shifted into the GHR on a speculative update
+   // update
+   input logic [4:0]       InstrClassE, // only bit [0] is used in this module
+   input logic [4:0]       BPInstrClassE, // only bit [0] is used in this module
+   input logic [4:0]       BPInstrClassD, // only bit [0] is used in this module
+   input logic [4:0]       BPInstrClassF, // only bit [0] is used in this module
+   input logic             BPPredDirWrongE,
+
+   input logic [`XLEN-1:0] PCE, // update PC; bits [k:1] are hashed into the PHT write address
+   input logic             PCSrcE, // value shifted into the GHR on an execute-stage update
+   input logic [1:0]       UpdateBPPredE // data written to the PHT
+
+   );
+  logic [k+1:0]            GHR, GHRNext; // k+2 bits: two more than the k-bit PHT index
+  logic [k-1:0]            PHTUpdateAdr, PHTUpdateAdr0, PHTUpdateAdr1;
+  logic                    PHTUpdateEN;
+  logic                    BPClassWrongNonCFI;
+  logic                    BPClassWrongCFI;
+  logic                    BPClassRightNonCFI;
+  logic                    BPClassRightBPWrong, BPClassRightBPRight; // explicit declarations; do not rely on implicit nets
+  logic [6:0]              GHRMuxSel;
+  logic                    GHRUpdateEN;
+  logic [k-1:0]            GHRLookup;
+
+  assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0];
+  assign BPClassWrongNonCFI = BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassRightBPWrong = BPInstrClassE[0] & InstrClassE[0] & BPPredDirWrongE;
+  assign BPClassRightBPRight = BPInstrClassE[0] & InstrClassE[0] & ~BPPredDirWrongE;
+
+
+  // GHR update selection, 1 hot encoded.
+  assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight);
+  assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]);
+  assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0];
+  assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0];
+  assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight));
+  assign GHRUpdateEN = (| GHRMuxSel[5:1] & ~StallE) | GHRMuxSel[6] & ~StallF;
+
+  // Next-GHR mux keyed on the one-hot select above; intended to map to an AND-OR mux.
+  always_comb begin
+    case (GHRMuxSel)
+      7'b000_0001: GHRNext = GHR[k-1+2:0]; // no change
+      7'b000_0010: GHRNext = {GHR[k-2+2:0], PCSrcE}; // branch update
+      7'b000_0100: GHRNext = {1'b0, GHR[k+1:1]}; // repair 1
+      7'b000_1000: GHRNext = {GHR[k-1+2:1], PCSrcE}; // branch update with mis prediction correction
+      7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2
+      7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1
+      7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update
+      default: GHRNext = GHR[k-1+2:0];
+    endcase
+  end
+
+  flopenr #(k+2) GlobalHistoryRegister(.clk(clk),
+                                       .reset(reset),
+                                       .en((GHRUpdateEN)),
+                                       .d(GHRNext),
+                                       .q(GHR));
+
+  // if actively updating the GHR at the time of prediction we want to use
+  // GHRNext as the lookup rather than GHR.
+
+  assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0];
+  assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1];
+  assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0;
+  assign PHTUpdateEN = InstrClassE[0] & ~StallE;
+
+  assign GHRLookup = |GHRMuxSel[6:1] ? GHRNext[k-1:0] : GHR[k-1:0];
+
+  // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT
+  SRAM2P1R1W #(k, 2) PHT(.clk(clk),
+                         .reset(reset),
+                         //.RA1(GHR[k-1:0]),
+                         .RA1(GHRLookup ^ PCNextF[k:1]),
+                         .RD1(BPPredF),
+                         .REN1(~StallF),
+                         .WA1(PHTUpdateAdr ^ PCE[k:1]),
+                         .WD1(UpdateBPPredE),
+                         .WEN1(PHTUpdateEN),
+                         .BitWEN1(2'b11));
+
+endmodule // gsharePredictor

From fe22fd2db8cc6f66ee5021ae4095aa55b1cc80ed Mon Sep 17 00:00:00 2001
From: Ross Thompson
Date: Tue, 1 Jun 2021 13:46:21 -0500
Subject: [PATCH 13/19] added clock gater to floating point divider to speed up simulation time.
--- wally-pipelined/src/fpu/fpu.sv | 9 ++++- wally-pipelined/src/generic/clockgater.sv | 46 +++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 wally-pipelined/src/generic/clockgater.sv diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv index c876b313..8362dbe3 100755 --- a/wally-pipelined/src/fpu/fpu.sv +++ b/wally-pipelined/src/fpu/fpu.sv @@ -275,7 +275,14 @@ module fpu ( fma1 fma1 (.*); //first and only instance of floating-point divider - fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .*); + logic fpdivClk; + + clockgater fpdivclkg(.E(FDivStartE), + .SE(DivBusyM), + .CLK(clk), + .ECLK(fpdivClk)); + + fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk)); //first of two-stage instance of floating-point add/cvt unit fpuaddcvt1 fpadd1 (.*); diff --git a/wally-pipelined/src/generic/clockgater.sv b/wally-pipelined/src/generic/clockgater.sv new file mode 100644 index 00000000..dc51829d --- /dev/null +++ b/wally-pipelined/src/generic/clockgater.sv @@ -0,0 +1,46 @@ +/////////////////////////////////////////// +// clockgater.sv +// +// Written: Ross Thompson 9 January 2021 +// Modified: +// +// Purpose: Clock gater model. Must use standard cell for synthesis. +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module clockgater // ECLK follows CLK only while the latched (E | SE) enable is high
+  (input logic  E, // enable
+   input logic  SE, // additional enable, OR'ed with E
+   input logic  CLK, // free-running input clock
+   output logic ECLK); // gated output clock
+
+  // VERY IMPORTANT.
+  // This part functionally models a clock gater, but does not necessarily meet the timing constraints a real standard cell would.
+  // Do not use this in synthesis!
+
+  logic         enable_q;
+
+  // Transparent-low latch: the enable may only change while CLK is low, so ECLK never sees a late or runt pulse.
+  always_latch begin
+    if (~CLK) enable_q <= E | SE;
+  end
+  assign ECLK = enable_q & CLK;
+
+endmodule
From 2eeb12c6741659ece6bc5e17d13ccbd04dbfc6c5 Mon Sep 17 00:00:00 2001
From: "James E.
Stine" Date: Tue, 1 Jun 2021 15:31:07 -0400 Subject: [PATCH 14/19] Updates to muldiv.sv for 32-bit div/rem --- wally-pipelined/config/rv64ic/wally-config.vh | 2 +- wally-pipelined/src/muldiv/muldiv.sv | 2 +- wally-pipelined/testbench/testbench-imperas.sv | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/wally-pipelined/config/rv64ic/wally-config.vh b/wally-pipelined/config/rv64ic/wally-config.vh index 259e41ae..12d254ba 100644 --- a/wally-pipelined/config/rv64ic/wally-config.vh +++ b/wally-pipelined/config/rv64ic/wally-config.vh @@ -31,7 +31,7 @@ `define XLEN 64 // MISA RISC-V configuration per specification -`define MISA (32'h00000104 | 0 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0) +`define MISA (32'h00000104 | 0 << 5 | 0 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0) `define A_SUPPORTED ((`MISA >> 0) % 2 == 1) `define C_SUPPORTED ((`MISA >> 2) % 2 == 1) `define D_SUPPORTED ((`MISA >> 3) % 2 == 1) diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv index 0c26a5df..e10b0c55 100644 --- a/wally-pipelined/src/muldiv/muldiv.sv +++ b/wally-pipelined/src/muldiv/muldiv.sv @@ -76,7 +76,7 @@ module muldiv ( assign Den0 = W64E ? 
{{32{SrcBE[31]&signedDivide}}, SrcBE[31:0]} : SrcBE; end else begin // RV32 has no W-type instructions assign Num0 = SrcAE; - assign Den0 = SrcAE; + assign Den0 = SrcBE; end // capture the Numerator/Denominator diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index c60aa40d..dabc6d12 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -320,11 +320,11 @@ string tests32f[] = '{ "rv32m/I-MUL-01", "2000", "rv32m/I-MULH-01", "2000", "rv32m/I-MULHSU-01", "2000", - "rv32m/I-MULHU-01", "2000" - //"rv32m/I-DIV-01", "2000", - //"rv32m/I-DIVU-01", "2000", - //"rv32m/I-REM-01", "2000", - //"rv32m/I-REMU-01", "2000" + "rv32m/I-MULHU-01", "2000", + "rv32m/I-DIV-01", "2000", + "rv32m/I-DIVU-01", "2000", + "rv32m/I-REM-01", "2000", + "rv32m/I-REMU-01", "2000" }; string tests32ic[] = '{ From 564d7c4adb0d62d3626e23e250aafa4b3db93bd4 Mon Sep 17 00:00:00 2001 From: "James E. Stine" Date: Tue, 1 Jun 2021 15:45:32 -0400 Subject: [PATCH 15/19] Minor cosmetic update to fpu.sv --- wally-pipelined/src/fpu/fpu.sv | 958 +++++++++++++++------------------ 1 file changed, 439 insertions(+), 519 deletions(-) diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv index 8362dbe3..e303f205 100755 --- a/wally-pipelined/src/fpu/fpu.sv +++ b/wally-pipelined/src/fpu/fpu.sv @@ -25,535 +25,455 @@ `include "wally-config.vh" module fpu ( - input logic [2:0] FRM_REGW, // Rounding mode from CSR - input logic reset, + input logic [2:0] FRM_REGW, // Rounding mode from CSR + input logic reset, //input logic clear, // *** not being used anywhere - input logic clk, - input logic [31:0] InstrD, - input logic [`XLEN-1:0] SrcAE, // Integer input being processed - input logic [`XLEN-1:0] SrcAM, // Integer input being written into fpreg - input logic StallE, StallM, StallW, - input logic FlushE, FlushM, FlushW, - input logic [`AHBW-1:0] HRDATA, - input logic RegWriteD, - 
output logic [4:0] SetFflagsM, - output logic [31:0] FSROutW, - output logic [1:0] FMemRWM, - output logic FStallD, - output logic FWriteIntE, FWriteIntM, FWriteIntW, + input logic clk, + input logic [31:0] InstrD, + input logic [`XLEN-1:0] SrcAE, // Integer input being processed + input logic [`XLEN-1:0] SrcAM, // Integer input being written into fpreg + input logic StallE, StallM, StallW, + input logic FlushE, FlushM, FlushW, + input logic [`AHBW-1:0] HRDATA, + input logic RegWriteD, + output logic [4:0] SetFflagsM, + output logic [31:0] FSROutW, + output logic [1:0] FMemRWM, + output logic FStallD, + output logic FWriteIntE, FWriteIntM, FWriteIntW, output logic [`XLEN-1:0] FWriteDataM, - output logic FDivSqrtDoneM, - output logic IllegalFPUInstrD, + output logic FDivSqrtDoneM, + output logic IllegalFPUInstrD, output logic [`XLEN-1:0] FPUResultW); - - - - - //control logic signal instantiation - logic FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW; // FP register write enable - logic [2:0] FrmD, FrmE, FrmM, FrmW; // FP rounding mode - logic FmtD, FmtE, FmtM, FmtW; // FP precision 0-single 1-double - logic FDivStartD, FDivStartE; // Start division - logic FWriteIntD; // Write to integer register - logic FOutputInput2D, FOutputInput2E; // Put Input2 in Input1 if a store instruction - logic [1:0] FMemRWD, FMemRWE; // Read and write enable for memory - logic [1:0] FForwardInput1D, FForwardInput1E; // Input1 forwarding mux control signal - logic [1:0] FForwardInput2D, FForwardInput2E; // Input2 forwarding mux control signal - logic FForwardInput3D, FForwardInput3E; // Input3 forwarding mux control signal - logic FInput2UsedD; // Is input 2 used - logic FInput3UsedD; // Is input 3 used - logic [2:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select FP result - logic [3:0] FOpCtrlD, FOpCtrlE, FOpCtrlM; // Select which opperation to do in each component - - // regfile signals - logic [4:0] RdE, RdM, RdW; // ***Can take from ieu - logic [`XLEN-1:0] FWDM; // Write 
data for FP register - logic [`XLEN-1:0] FRD1D, FRD2D, FRD3D; // Read Data from FP register - logic [`XLEN-1:0] FRD1E, FRD2E, FRD3E; - logic [`XLEN-1:0] FInput1E, FInput1M, FInput1tmpE; - logic [`XLEN-1:0] FInput2E, FInput2M; - logic [`XLEN-1:0] FInput3E, FInput3M; - logic [`XLEN-1:0] FLoadStoreResultM, FLoadStoreResultW; // Result for load, store, and move to int-reg instructions - - // div/sqrt signals - logic DivDenormM, DivDenormW; - logic DivOvEn, DivUnEn; - logic DivBusyM; - logic [63:0] FDivResultM, FDivResultW; - logic [4:0] FDivFlagsM, FDivFlagsW; - - // FMA signals - logic [12:0] aligncntE, aligncntM; - logic [105:0] rE, rM; - logic [105:0] sE, sM; - logic [163:0] tE, tM; - logic [8:0] normcntE, normcntM; - logic [12:0] aeE, aeM; - logic bsE, bsM; - logic killprodE, killprodM; - logic prodofE, prodofM; - logic xzeroE, xzeroM; - logic yzeroE, yzeroM; - logic zzeroE, zzeroM; - logic xdenormE, xdenormM; - logic ydenormE, ydenormM; - logic zdenormE, zdenormM; - logic xinfE, xinfM; - logic yinfE, yinfM; - logic zinfE, zinfM; - logic xnanE, xnanM; - logic ynanE, ynanM; - logic znanE, znanM; - logic nanE, nanM; - logic [8:0] sumshiftE, sumshiftM; - logic sumshiftzeroE, sumshiftzeroM; - logic prodinfE, prodinfM; - logic [63:0] FmaResultM, FmaResultW; - logic [4:0] FmaFlagsM, FmaFlagsW; - - // add/cvt signals - logic [63:0] AddSumE, AddSumTcE; - logic [3:0] AddSelInvE; - logic [10:0] AddExpPostSumE; - logic AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE; - logic AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE; - logic AddConvertE; - logic [63:0] AddFloat1E, AddFloat2E; - logic [11:0] AddExp1DenormE, AddExp2DenormE; - logic [10:0] AddExponentE; - logic [2:0] AddRmE; - logic [3:0] AddOpTypeE; - logic AddPE, AddOvEnE, AddUnEnE; - logic AddDenormM; - logic [63:0] AddSumM, AddSumTcM; - logic [3:0] AddSelInvM; - logic [10:0] AddExpPostSumM; - logic AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM; - logic 
AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM; - logic AddConvertM, AddSignM; - logic [63:0] AddFloat1M, AddFloat2M; - logic [11:0] AddExp1DenormM, AddExp2DenormM; - logic [10:0] AddExponentM; - logic [63:0] AddOp1M, AddOp2M; - logic [2:0] AddRmM; - logic [3:0] AddOpTypeM; - logic AddPM, AddOvEnM, AddUnEnM; - logic [63:0] FAddResultM, FAddResultW; - logic [4:0] FAddFlagsM, FAddFlagsW; - - //cmp signals - logic [7:0] WE, WM; - logic [7:0] XE, XM; - logic ANaNE, ANaNM; - logic BNaNE, BNaNM; - logic AzeroE, AzeroM; - logic BzeroE, BzeroM; - logic CmpInvalidM, CmpInvalidW; - logic [1:0] CmpFCCM, CmpFCCW; - logic [63:0] FCmpResultM, FCmpResultW; - - // fsgn signals - logic [63:0] SgnResultE, SgnResultM, SgnResultW; - logic [4:0] SgnFlagsE, SgnFlagsM, SgnFlagsW; - - //instantiation of W stage regfile signals - logic [`XLEN-1:0] SrcAW; - - // classify signals - logic [63:0] ClassResultE, ClassResultM, ClassResultW; - - // other - logic [63:0] FPUResult64W, FPUResult64E; // 64-bit FPU result - logic [4:0] FPUFlagsW; - - // pipeline control logic - logic PipeEnableDE; - logic PipeEnableEM; - logic PipeEnableMW; - logic PipeClearDE; - logic PipeClearEM; - logic PipeClearMW; - - //temporarily assign pipe clear and enable signals - //to never flush & always be running - localparam PipeClear = 1'b0; - localparam PipeEnable = 1'b1; - always_comb begin - - PipeEnableDE = ~StallE; - PipeEnableEM = ~StallM; - PipeEnableMW = ~StallW; - PipeClearDE = FlushE; - PipeClearEM = FlushM; - PipeClearMW = FlushW; - - end - - - - - - - - - - - - - - //DECODE STAGE - - //Hazard unit for FPU - fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*); - - //top-level controller for FPU - fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*); - - - //regfile instantiation + // control logic signal instantiation + logic FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW; // FP register write enable + logic [2:0] FrmD, 
FrmE, FrmM, FrmW; // FP rounding mode + logic FmtD, FmtE, FmtM, FmtW; // FP precision 0-single 1-double + logic FDivStartD, FDivStartE; // Start division + logic FWriteIntD; // Write to integer register + logic FOutputInput2D, FOutputInput2E; // Put Input2 in Input1 if a store instruction + logic [1:0] FMemRWD, FMemRWE; // Read and write enable for memory + logic [1:0] FForwardInput1D, FForwardInput1E; // Input1 forwarding mux control signal + logic [1:0] FForwardInput2D, FForwardInput2E; // Input2 forwarding mux control signal + logic FForwardInput3D, FForwardInput3E; // Input3 forwarding mux control signal + logic FInput2UsedD; // Is input 2 used + logic FInput3UsedD; // Is input 3 used + logic [2:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select FP result + logic [3:0] FOpCtrlD, FOpCtrlE, FOpCtrlM; // Select which opperation to do in each component + + // regfile signals + logic [4:0] RdE, RdM, RdW; // ***Can take from ieu + logic [`XLEN-1:0] FWDM; // Write data for FP register + logic [`XLEN-1:0] FRD1D, FRD2D, FRD3D; // Read Data from FP register + logic [`XLEN-1:0] FRD1E, FRD2E, FRD3E; + logic [`XLEN-1:0] FInput1E, FInput1M, FInput1tmpE; + logic [`XLEN-1:0] FInput2E, FInput2M; + logic [`XLEN-1:0] FInput3E, FInput3M; + logic [`XLEN-1:0] FLoadStoreResultM, FLoadStoreResultW; // Result for load, store, and move to int-reg instructions + + // div/sqrt signals + logic DivDenormM, DivDenormW; + logic DivOvEn, DivUnEn; + logic DivBusyM; + logic [63:0] FDivResultM, FDivResultW; + logic [4:0] FDivFlagsM, FDivFlagsW; + + // FMA signals + logic [12:0] aligncntE, aligncntM; + logic [105:0] rE, rM; + logic [105:0] sE, sM; + logic [163:0] tE, tM; + logic [8:0] normcntE, normcntM; + logic [12:0] aeE, aeM; + logic bsE, bsM; + logic killprodE, killprodM; + logic prodofE, prodofM; + logic xzeroE, xzeroM; + logic yzeroE, yzeroM; + logic zzeroE, zzeroM; + logic xdenormE, xdenormM; + logic ydenormE, ydenormM; + logic zdenormE, zdenormM; + logic xinfE, xinfM; + logic 
yinfE, yinfM; + logic zinfE, zinfM; + logic xnanE, xnanM; + logic ynanE, ynanM; + logic znanE, znanM; + logic nanE, nanM; + logic [8:0] sumshiftE, sumshiftM; + logic sumshiftzeroE, sumshiftzeroM; + logic prodinfE, prodinfM; + logic [63:0] FmaResultM, FmaResultW; + logic [4:0] FmaFlagsM, FmaFlagsW; + + // add/cvt signals + logic [63:0] AddSumE, AddSumTcE; + logic [3:0] AddSelInvE; + logic [10:0] AddExpPostSumE; + logic AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE; + logic AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE; + logic AddConvertE; + logic [63:0] AddFloat1E, AddFloat2E; + logic [11:0] AddExp1DenormE, AddExp2DenormE; + logic [10:0] AddExponentE; + logic [2:0] AddRmE; + logic [3:0] AddOpTypeE; + logic AddPE, AddOvEnE, AddUnEnE; + logic AddDenormM; + logic [63:0] AddSumM, AddSumTcM; + logic [3:0] AddSelInvM; + logic [10:0] AddExpPostSumM; + logic AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM; + logic AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM; + logic AddConvertM, AddSignM; + logic [63:0] AddFloat1M, AddFloat2M; + logic [11:0] AddExp1DenormM, AddExp2DenormM; + logic [10:0] AddExponentM; + logic [63:0] AddOp1M, AddOp2M; + logic [2:0] AddRmM; + logic [3:0] AddOpTypeM; + logic AddPM, AddOvEnM, AddUnEnM; + logic [63:0] FAddResultM, FAddResultW; + logic [4:0] FAddFlagsM, FAddFlagsW; + + // cmp signals + logic [7:0] WE, WM; + logic [7:0] XE, XM; + logic ANaNE, ANaNM; + logic BNaNE, BNaNM; + logic AzeroE, AzeroM; + logic BzeroE, BzeroM; + logic CmpInvalidM, CmpInvalidW; + logic [1:0] CmpFCCM, CmpFCCW; + logic [63:0] FCmpResultM, FCmpResultW; + + // fsgn signals + logic [63:0] SgnResultE, SgnResultM, SgnResultW; + logic [4:0] SgnFlagsE, SgnFlagsM, SgnFlagsW; + + // instantiation of W stage regfile signals + logic [`XLEN-1:0] SrcAW; + + // classify signals + logic [63:0] ClassResultE, ClassResultM, ClassResultW; + + // 64-bit FPU result + logic [63:0] FPUResult64W, FPUResult64E; + logic [4:0] 
FPUFlagsW; + + // pipeline control logic + logic PipeEnableDE; + logic PipeEnableEM; + logic PipeEnableMW; + logic PipeClearDE; + logic PipeClearEM; + logic PipeClearMW; + + // temporarily assign pipe clear and enable signals + // to never flush & always be running + localparam PipeClear = 1'b0; + localparam PipeEnable = 1'b1; + always_comb begin + PipeEnableDE = ~StallE; + PipeEnableEM = ~StallM; + PipeEnableMW = ~StallW; + PipeClearDE = FlushE; + PipeClearEM = FlushM; + PipeClearMW = FlushW; + end + + //DECODE STAGE + + // Hazard unit for FPU + fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*); + + // top-level controller for FPU + fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*); + + // regfile instantiation FPregfile fpregfile (clk, reset, FWriteEnW, InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW, FPUResult64W, FRD1D, FRD2D, FRD3D); - - - - - - - - - - //***************** - //fpregfile D/E pipe registers - //***************** - flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E); - flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E); - flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E); - - //***************** - //other D/E pipe registers - //***************** - flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE); - flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE); - flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE); - flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE); - flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE); - flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE); - flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE); - flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, 
FForwardInput1D, FForwardInput1E); - flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E); - flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E); - flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E); - flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE); - flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E); - flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE); - - - - - - - - - - - - - - //EXECUTION STAGE - - - - // input muxs for forwarding - mux4 #(64) FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE); - mux3 #(64) FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E); - mux2 #(64) FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E); - mux2 #(64) FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E); - - fma1 fma1 (.*); - - //first and only instance of floating-point divider - logic fpdivClk; - - clockgater fpdivclkg(.E(FDivStartE), - .SE(DivBusyM), - .CLK(clk), - .ECLK(fpdivClk)); - - fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk)); - - //first of two-stage instance of floating-point add/cvt unit - fpuaddcvt1 fpadd1 (.*); - - //first of two-stage instance of floating-point comparator - fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]); - - //first and only instance of floating-point sign converter - fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*); - - //first and only instance of floating-point classify unit - fpuclassify fpuclass (.*); - - - - - - - - - - - - - - - - - //***************** - //fpregfile D/E pipe registers - //***************** - flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M); - flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M); - 
flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M); - - //***************** - //fma E/M pipe registers - //***************** - flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); - flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); - flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); - flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); - flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); - flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM); - flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); - flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); - flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); - flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); - flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); - flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); - flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); - flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); - flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); - flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); - flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); - flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); - flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); - flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); - flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); - flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); - flopenrc #(9) EMRegFma23(clk, reset, 
PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); - flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); - flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); - - //***************** - //fpadd E/M pipe registers - //***************** - flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); - flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); - flopenrc #(4) EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); - flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); - flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); - flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); - flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); - flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); - flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); - flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); - flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); - flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); - flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); - flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); - flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); - flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); - flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); - flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); - flopenrc #(12) 
EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); - flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); - flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); - flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); - flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); - flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); - flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); - - //***************** - //fpcmp E/M pipe registers - //***************** - flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); - flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); - flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); - flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); - flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); - flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); - - //put this in for the event we want to delay fsgn - will otherwise bypass - //***************** - //fpsgn E/M pipe registers - //***************** - flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM); - flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM); - - //***************** - //other E/M pipe registers - //***************** - flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM); - flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM); - flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM); - flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM); - flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM); - flopenrc #(4) EMReg6(clk, reset, 
PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM); - flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM); - flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM); - - //***************** - //fpuclassify E/M pipe registers - //***************** - flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM); - - - - - - - - - //BEGIN MEMORY STAGE - - assign FWriteDataM = FInput1M; - - mux2 #(64) FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM); - - fma2 fma2(.*); - - //second instance of two-stage floating-point add/cvt unit - fpuaddcvt2 fpadd2 (.*); - - //second instance of two-stage floating-point comparator - fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*); - - - - - - - - - - - - //***************** - //fma M/W pipe registers - //***************** - flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); - flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); - - //***************** - //fpdiv M/W pipe registers - //***************** - flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); - flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW); - flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); - - //***************** - //fpadd M/W pipe registers - //***************** - flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); - flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); - - //***************** - //fpcmp M/W pipe registers - //***************** - flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); - flopenrc #(2) MWRegCmp2(clk, reset, 
PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); - flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); - - //***************** - //fpsgn M/W pipe registers - //***************** - flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW); - flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW); - - //***************** - //other M/W pipe registers - //***************** - flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW); - flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW); - flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW); - flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW); - flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW); - flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW); - flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW); - - - //***************** - //fpuclassify M/W pipe registers - //***************** - flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW); - - - - - - - + + //***************** + // fpregfile D/E pipe registers + //***************** + flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E); + flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E); + flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E); + + //***************** + // other D/E pipe registers + //***************** + flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE); + flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE); + flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE); + flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE); + flopenrc #(5) DEReg8(clk, 
reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE); + flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE); + flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE); + flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput1D, FForwardInput1E); + flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E); + flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E); + flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E); + flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE); + flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E); + flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE); + + //EXECUTION STAGE + + // input muxs for forwarding + mux4 #(64) FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE); + mux3 #(64) FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E); + mux2 #(64) FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E); + mux2 #(64) FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E); + + fma1 fma1 (.*); + + // first and only instance of floating-point divider + logic fpdivClk; + + clockgater fpdivclkg(.E(FDivStartE), + .SE(DivBusyM), + .CLK(clk), + .ECLK(fpdivClk)); + + fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk)); + + // first of two-stage instance of floating-point add/cvt unit + fpuaddcvt1 fpadd1 (.*); + + // first of two-stage instance of floating-point comparator + fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]); + + // first and only instance of floating-point sign converter + fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*); + + // first and only instance of floating-point classify unit + fpuclassify fpuclass (.*); + + //***************** + 
//fpregfile D/E pipe registers + //***************** + flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M); + flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M); + flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M); + + //***************** + // fma E/M pipe registers + //***************** + flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); + flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); + flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); + flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); + flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); + flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM); + flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); + flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); + flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); + flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); + flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); + flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); + flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); + flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); + flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); + flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); + flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); + flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); + flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); + flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, 
PipeEnableEM, ynanE, ynanM); + flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); + flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); + flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); + flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); + flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); + + //***************** + // fpadd E/M pipe registers + //***************** + flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); + flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); + flopenrc #(4) EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); + flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); + flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); + flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); + flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); + flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); + flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); + flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); + flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); + flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); + flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); + flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); + flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); + flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, 
AddFloat1M); + flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); + flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); + flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); + flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); + flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); + flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); + flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); + flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); + flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); + + //***************** + // fpcmp E/M pipe registers + //***************** + flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); + flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); + flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); + flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); + flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); + flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); + + // put this in for the event we want to delay fsgn - will otherwise bypass + //***************** + // fpsgn E/M pipe registers + //***************** + flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM); + flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM); + + //***************** + // other E/M pipe registers + //***************** + flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM); + flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM); + flopenrc #(3) EMReg3(clk, reset, 
PipeClearEM, PipeEnableEM, FrmE, FrmM); + flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM); + flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM); + flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM); + flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM); + flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM); + + //***************** + // fpuclassify E/M pipe registers + //***************** + flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM); + + //BEGIN MEMORY STAGE + + assign FWriteDataM = FInput1M; + + mux2 #(64) FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM); + + fma2 fma2(.*); + + // second instance of two-stage floating-point add/cvt unit + fpuaddcvt2 fpadd2 (.*); + + // second instance of two-stage floating-point comparator + fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), + .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*); + + //***************** + // fma M/W pipe registers + //***************** + flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); + flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); + + //***************** + // fpdiv M/W pipe registers + //***************** + flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); + flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW); + flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); + + //***************** + // fpadd M/W pipe registers + //***************** + flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); + flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); + + 
//***************** + // fpcmp M/W pipe registers + //***************** + flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); + flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); + flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); + + //***************** + // fpsgn M/W pipe registers + //***************** + flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW); + flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW); + + //***************** + // other M/W pipe registers + //***************** + flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW); + flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW); + flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW); + flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW); + flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW); + flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW); + flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW); + + //***************** + // fpuclassify M/W pipe registers + //***************** + flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW); //######################################### - //BEGIN WRITEBACK STAGE + // BEGIN WRITEBACK STAGE //######################################### - - always_comb begin - case (FResultSelW) - // div/sqrt - 3'b000 : FPUFlagsW = FDivFlagsW; - // cmp - 3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0}; - //fma/mult - 3'b010 : FPUFlagsW = FmaFlagsW; - // sgn inj - 3'b011 : FPUFlagsW = SgnFlagsW; - // add/sub/cnvt - 3'b100 : FPUFlagsW = FAddFlagsW; - // classify - 3'b101 : FPUFlagsW = 5'b0; - // output SrcAW - 3'b110 : FPUFlagsW = 5'b0; - // output FRD1 - 3'b111 : 
FPUFlagsW = 5'b0; - default : FPUFlagsW = 5'bxxxxx; - endcase - end - - - always_comb begin - case (FResultSelW) - // div/sqrt - 3'b000 : FPUResult64W = FDivResultW; - // cmp - 3'b001 : FPUResult64W = FCmpResultW; - //fma/mult - 3'b010 : FPUResult64W = FmaResultW; - // sgn inj - 3'b011 : FPUResult64W = SgnResultW; - // add/sub/cnvt - 3'b100 : FPUResult64W = FAddResultW; - // classify - 3'b101 : FPUResult64W = ClassResultW; - // output SrcAW - 3'b110 : FPUResult64W = SrcAW; - // Load/Store/Move to FP-register - 3'b111 : FPUResult64W = FLoadStoreResultW; - default : FPUResult64W = {64{1'bx}}; - endcase - end - //interface between XLEN size datapath and double-precision sized - //floating-point results - // - //define offsets for LSB zero extension or truncation - always_comb begin - - //zero extension + + always_comb begin + case (FResultSelW) + // div/sqrt + 3'b000 : FPUFlagsW = FDivFlagsW; + // cmp + 3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0}; + //fma/mult + 3'b010 : FPUFlagsW = FmaFlagsW; + // sgn inj + 3'b011 : FPUFlagsW = SgnFlagsW; + // add/sub/cnvt + 3'b100 : FPUFlagsW = FAddFlagsW; + // classify + 3'b101 : FPUFlagsW = 5'b0; + // output SrcAW + 3'b110 : FPUFlagsW = 5'b0; + // output FRD1 + 3'b111 : FPUFlagsW = 5'b0; + default : FPUFlagsW = 5'bxxxxx; + endcase + end + + always_comb begin + case (FResultSelW) + // div/sqrt + 3'b000 : FPUResult64W = FDivResultW; + // cmp + 3'b001 : FPUResult64W = FCmpResultW; + //fma/mult + 3'b010 : FPUResult64W = FmaResultW; + // sgn inj + 3'b011 : FPUResult64W = SgnResultW; + // add/sub/cnvt + 3'b100 : FPUResult64W = FAddResultW; + // classify + 3'b101 : FPUResult64W = ClassResultW; + // output SrcAW + 3'b110 : FPUResult64W = SrcAW; + // Load/Store/Move to FP-register + 3'b111 : FPUResult64W = FLoadStoreResultW; + default : FPUResult64W = {64{1'bx}}; + endcase + end // always_comb + + // interface between XLEN size datapath and double-precision sized + // floating-point results + // + // define offsets for LSB zero extension or 
truncation + always_comb begin + // zero extension FPUResultW = FPUResult64W[63:64-`XLEN]; - SetFflagsM = FPUFlagsW; + SetFflagsM = FPUFlagsW; + end + +endmodule // fpu - end -endmodule From 0670c57fd2638defa89c97712dfaedaed5ddf3c9 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 15:05:22 -0500 Subject: [PATCH 16/19] The clock gater was not implemented correctly. Now it is level sensitive to a low clock. --- wally-pipelined/src/generic/clockgater.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wally-pipelined/src/generic/clockgater.sv b/wally-pipelined/src/generic/clockgater.sv index dc51829d..c06a1cbd 100644 --- a/wally-pipelined/src/generic/clockgater.sv +++ b/wally-pipelined/src/generic/clockgater.sv @@ -38,7 +38,7 @@ module clockgater logic enable_q; - always @(E or SE) begin + always @(~CLK) begin enable_q <= E | SE; end assign ECLK = enable_q & CLK; From eba7ce64f56fc49a2fb4017290af23ac9a820712 Mon Sep 17 00:00:00 2001 From: "James E. Stine" Date: Tue, 1 Jun 2021 17:39:54 -0400 Subject: [PATCH 17/19] delete div.bak --- wally-pipelined/src/muldiv/div.bak | 1560 ---------------------------- 1 file changed, 1560 deletions(-) delete mode 100755 wally-pipelined/src/muldiv/div.bak diff --git a/wally-pipelined/src/muldiv/div.bak b/wally-pipelined/src/muldiv/div.bak deleted file mode 100755 index 4266ae61..00000000 --- a/wally-pipelined/src/muldiv/div.bak +++ /dev/null @@ -1,1560 +0,0 @@ -/////////////////////////////////////////// -// mul.sv -// -// Written: James.Stine@okstate.edu 1 February 2021 -// Modified: -// -// Purpose: Integer Divide instructions -// -// A component of the Wally configurable RISC-V project. 
-// -// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, -// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software -// is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT -// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -/////////////////////////////////////////// - -// *** I added these verilator controls to clean up the -// lint output. The linter warnings should be fixed, but now the output is at -// least readable. 
-/* verilator lint_off COMBDLY */ -/* verilator lint_off IMPLICIT */ - -`include "wally-config.vh" - -module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); - - input logic [63:0] N, D; - input logic clk; - input logic reset; - input logic start; - input logic S; - - output logic [63:0] Qf; - output logic [63:0] remf; - output logic div0; - output logic done; - output logic divBusy; - - logic divdone; - logic enable; - logic state0; - logic V; - logic [7:0] Num; - logic [5:0] P, NumIter, RemShift; - logic [63:0] op1, op2, op1shift, Rem5; - logic [64:0] Qd, Rd, Qd2, Rd2; - logic [63:0] Q, rem0; - logic [3:0] quotient; - logic otfzero; - logic shiftResult; - logic enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp; - - logic [63:0] twoD; - logic [63:0] twoN; - logic SignD; - logic SignN; - logic [63:0] QT, remT; - logic D_NegOne; - logic Max_N; - - // Check if negative (two's complement) - // If so, convert to positive - adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD); - adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN); - assign SignD = D[63]; - assign SignN = N[63]; - // Max N and D = -1 (Overflow) - assign Max_N = (~|N[62:0]) & N[63]; - assign D_NegOne = &D; - - // Divider goes the distance to 37 cycles - // (thanks to the evil divisor for D = 0x1) - - // Shift D, if needed (for integer) - // needed to allow qst to be in range for integer - // division [1,2) and allow integer divide to work. - // - // The V or valid bit can be used to determine if D - // is 0 and thus a divide by 0 exception. This div0 - // exception is given to FSM to tell the operation to - // quit gracefully. 
- - lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD)); - shift_left #(64) p2 (twoD, P, op2); - assign op1 = twoN; - assign div0 = ~V; - - // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0) - // v = 2 since \rho < 1 (add 4 to make sure its a ceil) - adder #(8) cpa3 ({2'b0, P}, - {5'h0, shiftResult, ~shiftResult, 1'b0}, - Num); - - // Determine whether need to add just Q/Rem - assign shiftResult = P[0]; - // div by 2 (ceil) - assign NumIter = Num[6:1]; - assign RemShift = P; - - // FSM to control integer divider - // assume inputs are postive edge and - // datapath (divider) is negative edge - fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv, - start, div0, NumIter, ~clk, reset); - - flopr #(1) rega (~clk, reset, donev, done); - flopr #(1) regb (~clk, reset, divdonev, divdone); - flopr #(1) regc (~clk, reset, otfzerov, otfzero); - flopr #(1) regd (~clk, reset, enablev, enable); - flopr #(1) rege (~clk, reset, state0v, state0); - flopr #(1) regf (~clk, reset, divBusyv, divBusy); - - // To obtain a correct remainder the last bit of the - // quotient has to be aligned with a radix-r boundary. - // Since the quotient is in the range 1/2 < q < 2 (one - // integer bit and m fractional bits), this is achieved by - // shifting N right by v+s so that (m+v+s) mod k = 0. And, - // the quotient has to be aligned to the integer position. - - divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, - enable, otfzero, shiftResult); - - // Storage registers to hold contents stable - flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2); - flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2); - - // Probably not needed - just assigns results - assign Q = Qd2[63:0]; - assign Rem5 = Rd2[64:1]; - - // Adjust remainder by m - shift_right #(64) p4 (Rem5, RemShift, rem0); - - // Adjust Q/Rem for Signed - assign tcQ = (SignN ^ SignD) & S; - assign tcR = SignN & S; - // Signed Divide - // - When N and D are negative: Remainder is negative (undergoes a two's complement). 
- // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement). - // - When D is negative: Quotient is negative (undergoes a two's complement). - adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT); - adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT); - - // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec) - exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf); - -endmodule // int32div - -module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, - enable, otfzero, shiftResult); - - input logic [63:0] op1, op2; - input logic clk, state0; - input logic reset; - input logic enable; - input logic otfzero; - input logic shiftResult; - - output logic [64:0] rem0; - output logic [64:0] Q; - output logic [3:0] quotient; - - logic [67:0] Sum, Carry; - logic [64:0] Qstar; - logic [64:0] QMstar; - logic [7:0] qtotal; - logic [67:0] SumN, CarryN, SumN2, CarryN2; - logic [67:0] divi1, divi2, divi1c, divi2c, dive1; - logic [67:0] mdivi_temp, mdivi; - logic zero; - logic [1:0] qsel; - logic [1:0] Qin, QMin; - logic CshiftQ, CshiftQM; - logic [67:0] rem1, rem2, rem3; - logic [67:0] SumR, CarryR; - logic [64:0] Qt; - - // Create one's complement values of Divisor (for q*D) - assign divi1 = {3'h0, op2, 1'b0}; - assign divi2 = {2'h0, op2, 2'b0}; - assign divi1c = ~divi1; - assign divi2c = ~divi2; - // Shift x1 if not mod k - mux2 #(68) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1); - - // I I I . F F F F F ... 
(Robertson Criteria - \rho * qmax * D) - mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN); - mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN); - // Simplify QST - adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal); - // q = {+2, +1, -1, -2} else q = 0 - qst4 pd1 (qtotal[7:1], divi1[63:61], quotient); - assign ulp = quotient[2]|quotient[3]; - assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]); - // Map to binary encoding - assign qsel[1] = quotient[3]|quotient[2]; - assign qsel[0] = quotient[3]|quotient[1]; - mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp); - mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi); - csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry); - // regs : save CSA - flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2); - flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2); - // OTF - ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM); - otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, - otfzero, enable, Qstar, QMstar); - - // Correction and generation of Remainder - adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1); - // Add back +D as correction - csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR); - adder #(68) cpa3 (SumR, CarryR, rem2); - // Choose remainder (Rem or Rem+D) - mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3); - // Choose correct Q or QM - mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt); - // Final results - assign rem0 = rem3[64:0]; - assign Q = Qt; - -endmodule // divide4x64 - -module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM); - - input logic [3:0] quot; - - output logic [1:0] Qin; - output logic [1:0] QMin; - output logic CshiftQ; - output logic CshiftQM; - - // Load/Store Control for OTF - assign Qin[1] = (quot[1]) | (quot[3]) | (quot[0]); - assign Qin[0] = (quot[1]) | (quot[2]); - assign QMin[1] = (quot[1]) | (!quot[3]&!quot[2]&!quot[1]&!quot[0]); - assign QMin[0] = (quot[3]) | (quot[0]) | - (!quot[3]&!quot[2]&!quot[1]&!quot[0]); - assign 
CshiftQ = (quot[1]) | (quot[0]); - assign CshiftQM = (quot[3]) | (quot[2]); - -endmodule - -// On-the-fly Conversion per Ercegovac/Lang - -module otf #(parameter WIDTH=8) - (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q); - - input logic [1:0] Qin, QMin; - input logic CshiftQ, CshiftQM; - input logic clk; - input logic reset; - input logic enable; - - output logic [WIDTH-1:0] R2Q; - output logic [WIDTH-1:0] R1Q; - - logic [WIDTH-1:0] Qstar, QMstar; - logic [WIDTH-1:0] M1Q, M2Q; - - // QM - mux2 #(WIDTH) m1 (QMstar, Qstar, CshiftQM, M1Q); - flopenr #(WIDTH) r1 (clk, reset, enable, {M1Q[WIDTH-3:0], QMin}, R1Q); - // Q - mux2 #(WIDTH) m2 (Qstar, QMstar, CshiftQ, M2Q); - flopenr #(WIDTH) r2 (clk, reset, enable, {M2Q[WIDTH-3:0], Qin}, R2Q); - - assign Qstar = R2Q; - assign QMstar = R1Q; - -endmodule // otf8 - -module adder #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, - output logic [WIDTH-1:0] y); - - assign y = a + b; - -endmodule // adder - -module fa (input logic a, b, c, output logic sum, carry); - - assign sum = a^b^c; - assign carry = a&b|a&c|b&c; - -endmodule // fa - -module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c, - output logic [WIDTH-1:0] sum, carry); - - logic [WIDTH:0] carry_temp; - genvar i; - generate - for (i=0;i B. LT and GT are both '0' if A = B. - -module magcompare2b (LT, GT, A, B); - - input logic [1:0] A; - input logic [1:0] B; - - output logic LT; - output logic GT; - - // Determine if A < B using a minimized sum-of-products expression - assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0]; - // Determine if A > B using a minimized sum-of-products expression - assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0]; - -endmodule // magcompare2b - -// J. E. Stine and M. J. Schulte, "A combined two's complement and -// floating-point comparator," 2005 IEEE International Symposium on -// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
-// doi: 10.1109/ISCAS.2005.1464531 - -module magcompare8 (LT, EQ, A, B); - - input logic [7:0] A; - input logic [7:0] B; - - logic [3:0] s; - logic [3:0] t; - logic [1:0] u; - logic [1:0] v; - logic GT; - //wire LT; - - output logic EQ; - output logic LT; - - magcompare2b mag1 (s[0], t[0], A[1:0], B[1:0]); - magcompare2b mag2 (s[1], t[1], A[3:2], B[3:2]); - magcompare2b mag3 (s[2], t[2], A[5:4], B[5:4]); - magcompare2b mag4 (s[3], t[3], A[7:6], B[7:6]); - - magcompare2b mag5 (u[0], v[0], t[1:0], s[1:0]); - magcompare2b mag6 (u[1], v[1], t[3:2], s[3:2]); - - magcompare2b mag7 (LT, GT, v[1:0], u[1:0]); - - assign EQ = ~(GT | LT); - -endmodule // magcompare8 - -module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); - - input logic [63:0] Q; - input logic [63:0] rem; - input logic [63:0] op1; - input logic S; - input logic div0; - input logic Max_N; - input logic D_NegOne; - - output logic [63:0] Qf; - output logic [63:0] remf; - - // Needs to be optimized - always_comb - case ({div0, S, Max_N, D_NegOne}) - 4'b0000 : Qf = Q; - 4'b0001 : Qf = Q; - 4'b0010 : Qf = Q; - 4'b0011 : Qf = Q; - 4'b0100 : Qf = Q; - 4'b0101 : Qf = Q; - 4'b0110 : Qf = Q; - 4'b0111 : Qf = {1'b1, 31'h0}; - 4'b1000 : Qf = {64{1'b1}}; - 4'b1001 : Qf = {64{1'b1}}; - 4'b1010 : Qf = {64{1'b1}}; - 4'b1011 : Qf = {64{1'b1}}; - 4'b1100 : Qf = {64{1'b1}}; - 4'b1101 : Qf = {64{1'b1}}; - 4'b1110 : Qf = {64{1'b1}}; - 4'b1111 : Qf = {64{1'b1}}; - default: Qf = Q; - endcase - - always_comb - case ({div0, S, Max_N, D_NegOne}) - 4'b0000 : remf = rem; - 4'b0001 : remf = rem; - 4'b0010 : remf = rem; - 4'b0011 : remf = rem; - 4'b0100 : remf = rem; - 4'b0101 : remf = rem; - 4'b0110 : remf = rem; - 4'b0111 : remf = 64'h0; - 4'b1000 : remf = op1; - 4'b1001 : remf = op1; - 4'b1010 : remf = op1; - 4'b1011 : remf = op1; - 4'b1100 : remf = op1; - 4'b1101 : remf = op1; - 4'b1110 : remf = op1; - 4'b1111 : remf = op1; - default: remf = rem; - endcase - -endmodule // exception_int - -/* verilator lint_on 
COMBDLY */ -/* verilator lint_on IMPLICIT */ - From 40cfa8693564e3f74a5f1a0cd437d0d0ff03d577 Mon Sep 17 00:00:00 2001 From: Kip Macsai-Goren Date: Tue, 1 Jun 2021 17:49:45 -0400 Subject: [PATCH 18/19] Edited and added constants to support SV48 --- .../config/buildroot/wally-constants.vh | 26 +++++++++++++------ .../config/busybear/wally-constants.vh | 26 +++++++++++++------ .../config/coremark/wally-constants.vh | 26 +++++++++++++------ .../config/coremark_bare/wally-constants.vh | 26 +++++++++++++------ .../config/rv32ic/wally-constants.vh | 12 ++++++++- .../config/rv64BP/wally-constants.vh | 26 +++++++++++++------ .../config/rv64ic/wally-constants.vh | 26 +++++++++++++------ .../config/rv64icfd/wally-constants.vh | 26 +++++++++++++------ .../config/rv64imc/wally-constants.vh | 26 +++++++++++++------ 9 files changed, 155 insertions(+), 65 deletions(-) diff --git a/wally-pipelined/config/buildroot/wally-constants.vh b/wally-pipelined/config/buildroot/wally-constants.vh index 43d95863..cc6c27fc 100644 --- a/wally-pipelined/config/buildroot/wally-constants.vh +++ b/wally-pipelined/config/buildroot/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project.
// @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/busybear/wally-constants.vh b/wally-pipelined/config/busybear/wally-constants.vh index 43d95863..cc6c27fc 100644 --- a/wally-pipelined/config/busybear/wally-constants.vh +++ b/wally-pipelined/config/busybear/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/coremark/wally-constants.vh b/wally-pipelined/config/coremark/wally-constants.vh index 43d95863..cc6c27fc 100644 --- a/wally-pipelined/config/coremark/wally-constants.vh +++ b/wally-pipelined/config/coremark/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/coremark_bare/wally-constants.vh b/wally-pipelined/config/coremark_bare/wally-constants.vh index 43d95863..cc6c27fc 100644 --- a/wally-pipelined/config/coremark_bare/wally-constants.vh +++ b/wally-pipelined/config/coremark_bare/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/rv32ic/wally-constants.vh b/wally-pipelined/config/rv32ic/wally-constants.vh index ec4a48b4..f4c5ce9a 100644 --- a/wally-pipelined/config/rv32ic/wally-constants.vh +++ b/wally-pipelined/config/rv32ic/wally-constants.vh @@ -2,7 +2,10 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 31 May 2021 +// added svmode constants. These aren't strictly necessary since we're just checking one bit, +// but they're here to stay consistent and to make sure we dont wind up +// a "NO_TRANSLATE undefined" situation. // // Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. 
// These macros should not be changed, except in the event of an @@ -31,3 +34,10 @@ `define PPN_BITS 22 `define PPN_HIGH_SEGMENT_BITS 12 `define PA_BITS 34 +`define SVMODE_BITS 1 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 // These two are only here to stop +`define SV48 9 // the verilator from yelling at me diff --git a/wally-pipelined/config/rv64BP/wally-constants.vh b/wally-pipelined/config/rv64BP/wally-constants.vh index 43d95863..cc6c27fc 100644 --- a/wally-pipelined/config/rv64BP/wally-constants.vh +++ b/wally-pipelined/config/rv64BP/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/rv64ic/wally-constants.vh b/wally-pipelined/config/rv64ic/wally-constants.vh index 43d95863..cc6c27fc 100644 --- a/wally-pipelined/config/rv64ic/wally-constants.vh +++ b/wally-pipelined/config/rv64ic/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/rv64icfd/wally-constants.vh b/wally-pipelined/config/rv64icfd/wally-constants.vh index 43d95863..cc6c27fc 100644 --- a/wally-pipelined/config/rv64icfd/wally-constants.vh +++ b/wally-pipelined/config/rv64icfd/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/rv64imc/wally-constants.vh b/wally-pipelined/config/rv64imc/wally-constants.vh index 43d95863..cc6c27fc 100644 --- a/wally-pipelined/config/rv64imc/wally-constants.vh +++ b/wally-pipelined/config/rv64imc/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 From 5187574e8a45a3a993e88aafb0fda0b372c82e1f Mon Sep 17 00:00:00 2001 From: Kip Macsai-Goren Date: Tue, 1 Jun 2021 17:50:37 -0400 Subject: [PATCH 19/19] implemented Sv48. --- wally-pipelined/src/mmu/cam_line.sv | 20 +++-- wally-pipelined/src/mmu/page_number_mixer.sv | 87 +++++++++++++++----- wally-pipelined/src/mmu/pagetablewalker.sv | 60 ++++++++++---- wally-pipelined/src/mmu/tlb.sv | 33 +++++--- wally-pipelined/src/mmu/tlb_cam.sv | 25 +++--- 5 files changed, 160 insertions(+), 65 deletions(-) diff --git a/wally-pipelined/src/mmu/cam_line.sv b/wally-pipelined/src/mmu/cam_line.sv index b7577573..6bab0b60 100644 --- a/wally-pipelined/src/mmu/cam_line.sv +++ b/wally-pipelined/src/mmu/cam_line.sv @@ -2,7 +2,9 @@ // cam_line.sv // // Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// Implemented SV48 on top of SV39. This included adding SvMode input signal and the wally constants +// Mostly this was done to make the PageNumberMixer work. // // Purpose: CAM line for the translation lookaside buffer (TLB) // Determines whether a virtual address matches the stored key. @@ -24,12 +26,17 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// +`include "wally-constants.vh" + module cam_line #(parameter KEY_BITS = 20, parameter HIGH_SEGMENT_BITS = 10) ( input clk, reset, + // input to check which SvMode is running + input [`SVMODE_BITS-1:0] SvMode, + // The requested page number to compare against the key - input [KEY_BITS-1:0] VirtualPageNumber, + input [KEY_BITS-1:0] VirtualPageNumber, // Signals to write a new entry to this line input CAMLineWrite, @@ -38,10 +45,11 @@ module cam_line #(parameter KEY_BITS = 20, // Flush this line (set valid to 0) input TLBFlush, - // This entry is a key for a giga, mega, or kilopage. + // This entry is a key for a tera, giga, mega, or kilopage. // PageType == 2'b00 --> kilopage // PageType == 2'b01 --> megapage - // PageType == 2'b11 --> gigapage + // PageType == 2'b10 --> gigapage + // PageType == 2'b11 --> terapage output [1:0] PageType, // *** should this be the stored version or the always updated one? output Match ); @@ -67,9 +75,9 @@ module cam_line #(parameter KEY_BITS = 20, flopenr #(KEY_BITS) keyflop(clk, reset, CAMLineWrite, VirtualPageNumber, Key); // Calculate the actual query key based on the input key and the page type. - // For example, a megapage in sv39 only cares about VPN2 and VPN1, so VPN0 + // For example, a megapage in SV39 only cares about VPN2 and VPN1, so VPN0 // should automatically match.
+ page_number_mixer #(KEY_BITS, HIGH_SEGMENT_BITS) mixer(VirtualPageNumber, Key, PageType, SvMode, VirtualPageNumberQuery); assign Match = ({1'b1, VirtualPageNumberQuery} == {Valid, Key}); diff --git a/wally-pipelined/src/mmu/page_number_mixer.sv b/wally-pipelined/src/mmu/page_number_mixer.sv index 57b8e4b7..03851018 100644 --- a/wally-pipelined/src/mmu/page_number_mixer.sv +++ b/wally-pipelined/src/mmu/page_number_mixer.sv @@ -2,7 +2,11 @@ // page_number_mixer.sv // // Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// Implemented SV48 on top of SV39. This included adding a 3rd Segment to each of the pagenumbers, +// Ensuring that the BITS and HIGH_SEGMENT_BITS inputs were correct everywhere this module gets instantiated, +// Adding several muxes to decide the bit selection to turn pagenumbers into segments based on SV mode, +// Adding support for terapage/newgigapage encoding. // // Purpose: Takes two page numbers and replaces segments of the first page // number with segments from the second, based on the page type. @@ -25,22 +29,29 @@ /////////////////////////////////////////// `include "wally-config.vh" +`include "wally-constants.vh" module page_number_mixer #(parameter BITS = 20, parameter HIGH_SEGMENT_BITS = 10) ( - input [BITS-1:0] PageNumber, - input [BITS-1:0] MixPageNumber, - input [1:0] PageType, - output [BITS-1:0] PageNumberCombined + input [BITS-1:0] PageNumber, + input [BITS-1:0] MixPageNumber, + input [1:0] PageType, + input [`SVMODE_BITS-1:0] SvMode, + + output [BITS-1:0] PageNumberCombined ); + // The upper segment might have a different width than the lower segments. + // For example, an SV39 PTE has 26 bits for PPN2 and 9 bits for the other + // segments.
This is outside the 'if XLEN' b/c the constant is already configured + // to the correct value for the XLEN in the relevant wally-constants.vh file. + localparam LOW_SEGMENT_BITS = `VPN_SEGMENT_BITS; + // *** each time this module is implemented, low segment bits is either + // `VPN_SEGMENT_BITS or `PPN_LOW_SEGMENT_BITS (if it existed) + // in every mode so far, these are the same, so it's left as it is above. + generate - // *** Just checking XLEN is not enough to support sv39 AND sv48. if (`XLEN == 32) begin - // The upper segment might have a different width than the lower segments. - // For example, an sv39 PTE has 26 bits for PPN2 and 9 bits for the other - // segments. - localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS); logic [HIGH_SEGMENT_BITS-1:0] Segment1, MixSegment1, Segment1Combined; logic [LOW_SEGMENT_BITS-1:0] Segment0, MixSegment0, Segment0Combined; @@ -58,28 +69,60 @@ module page_number_mixer #(parameter BITS = 20, // Reswizzle segments of the combined page number assign PageNumberCombined = {Segment1Combined, Segment0Combined}; end else begin - // The upper segment might have a different width than the lower segments. - // For example, an sv39 PTE has 26 bits for PPN2 and 9 bits for the other - // segments. - localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS) / 2; - logic [HIGH_SEGMENT_BITS-1:0] Segment2, MixSegment2, Segment2Combined; + // After segment 0 and 1 of the page number, the width of each segment is dependent on the SvMode. + // For this reason, each segment bus is the width of its widest value across each mode + // when a smaller value needs to be loaded in to a wider bus, it's loaded in the least significant bits + // and left padded with zeros. MAKE SURE that if a value is being padded with zeros here, + // that it's padded with zeros everywhere else in the MMU and beyond to avoid false misses in the TLB.
+ logic [HIGH_SEGMENT_BITS-1:0] Segment3, MixSegment3, Segment3Combined; + logic [HIGH_SEGMENT_BITS + LOW_SEGMENT_BITS-1:0] Segment2, MixSegment2, Segment2Combined; logic [LOW_SEGMENT_BITS-1:0] Segment1, MixSegment1, Segment1Combined; logic [LOW_SEGMENT_BITS-1:0] Segment0, MixSegment0, Segment0Combined; + // Unswizzle segments of the input page number - assign {Segment2, Segment1, Segment0} = PageNumber; - assign {MixSegment2, MixSegment1, MixSegment0} = MixPageNumber; + // *** these muxes assume that only Sv48 and SV39 are implemented in rv64. for future SV57 and up, + // there will have to be more muxes to select which value each segment gets. + // as a cool reminder: BITS is the width of the page number, virt or phys, coming into this module + // while high segment bits is the width of the highest segment of that page number. + // Note for future work: this module has to work with both VPNs and PPNs and due to their differing + // widths and the fact that the ppn has one longer segment at the top makes the muxes below very confusing. + // Potentially very annoying thing for future workers: the number of bits in a ppn is always 44 (for SV39 and48) + // but in SV57 and above, this might be a new longer length. In that case these selectors will most likely + // become even more complicated and confusing. + assign Segment3 = (SvMode == `SV48) ? + PageNumber[BITS-1:3*LOW_SEGMENT_BITS] : // take the top segment or not + {HIGH_SEGMENT_BITS{1'b0}}; // for virtual page numbers in SV39, both options should be zeros. + assign Segment2 = (SvMode == `SV48) ? + {{HIGH_SEGMENT_BITS{1'b0}}, PageNumber[3*LOW_SEGMENT_BITS-1:2*LOW_SEGMENT_BITS]} : // just take another low segment left padded with zeros. + PageNumber[BITS-1:2*LOW_SEGMENT_BITS]; // otherwise take the rest of the PageNumber + assign Segment1 = PageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS]; + assign Segment0 = PageNumber[LOW_SEGMENT_BITS-1:0]; + + + assign MixSegment3 = (SvMode == `SV48) ? 
+ MixPageNumber[BITS-1:3*LOW_SEGMENT_BITS] : // take the top segment or not + {HIGH_SEGMENT_BITS{1'b0}}; // for virtual page numbers in SV39, both options should be zeros. + assign MixSegment2 = (SvMode == `SV48) ? + {{HIGH_SEGMENT_BITS{1'b0}}, MixPageNumber[3*LOW_SEGMENT_BITS-1:2*LOW_SEGMENT_BITS]} : // just take another low segment left padded with zeros. + MixPageNumber[BITS-1:2*LOW_SEGMENT_BITS]; // otherwise take the rest of the PageNumber + assign MixSegment1 = MixPageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS]; + assign MixSegment0 = MixPageNumber[LOW_SEGMENT_BITS-1:0]; + // Pass through the high segment - assign Segment2Combined = Segment2; + assign Segment3Combined = Segment3; - // Either pass through or zero out segments 1 and 0 based on the page type - mux2 #(LOW_SEGMENT_BITS) segment1mux(Segment1, MixSegment1, PageType[1], Segment1Combined); - mux2 #(LOW_SEGMENT_BITS) segment0mux(Segment0, MixSegment0, PageType[0], Segment0Combined); + // Either pass through or zero out lower segments based on the page type + assign Segment2Combined = (PageType[1] && PageType[0]) ? MixSegment2 : Segment2; // terapage (page == 11) + assign Segment1Combined = (PageType[1]) ? MixSegment1 : Segment1; // gigapage and higher (page == 10 or 11) + assign Segment0Combined = (PageType[1] || PageType[0]) ? MixSegment0 : Segment0; // megapage and higher (page == 01 or 10 or 11) // Reswizzle segments of the combined page number - assign PageNumberCombined = {Segment2Combined, Segment1Combined, Segment0Combined}; + assign PageNumberCombined = (SvMode == `SV48) ? 
+ {Segment3Combined, Segment2Combined[LOW_SEGMENT_BITS-1:0], Segment1Combined, Segment0Combined} : + {Segment2Combined, Segment1Combined, Segment0Combined}; end endgenerate endmodule diff --git a/wally-pipelined/src/mmu/pagetablewalker.sv b/wally-pipelined/src/mmu/pagetablewalker.sv index f2aada44..b0e4fe8e 100644 --- a/wally-pipelined/src/mmu/pagetablewalker.sv +++ b/wally-pipelined/src/mmu/pagetablewalker.sv @@ -2,7 +2,10 @@ // pagetablewalker.sv // // Written: tfleming@hmc.edu 2 March 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// implemented SV48 on top of SV39. This included, adding a level of the FSM for the extra page number segment +// adding support for terapage encoding, and for setting the TranslationPAdr using the new level, +// adding the internal SvMode signal // // Purpose: Page Table Walker // Part of the Memory Management Unit (MMU) @@ -70,6 +73,7 @@ module pagetablewalker ( logic [`XLEN-1:0] SavedPTE, CurrentPTE; logic [`PA_BITS-1:0] TranslationPAdr; logic [`PPN_BITS-1:0] CurrentPPN; + logic [`SVMODE_BITS-1:0] SvMode; logic MemStore; // PTE Control Bits @@ -82,6 +86,8 @@ module pagetablewalker ( logic [`XLEN-1:0] PageTableEntry; logic [1:0] PageType; + assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS]; + assign BasePageTablePPN = SATP_REGW[`PPN_BITS-1:0]; assign MemStore = MemRWM[0]; @@ -105,11 +111,12 @@ module pagetablewalker ( assign PageTypeF = PageType; assign PageTypeM = PageType; - localparam IDLE = 3'h0; + localparam LEVEL0 = 3'h0; localparam LEVEL1 = 3'h1; - localparam LEVEL0 = 3'h2; - localparam LEAF = 3'h3; - localparam FAULT = 3'h4; + // space left for more levels + localparam LEAF = 3'h5; + localparam IDLE = 3'h6; + localparam FAULT = 3'h7; logic [2:0] WalkerState, NextWalkerState; @@ -208,18 +215,32 @@ module pagetablewalker ( assign MMUPAdr = TranslationPAdr[31:0]; end else begin - localparam LEVEL2 = 3'h5; + localparam LEVEL2 = 3'h2; + localparam LEVEL3 = 3'h3; - logic [8:0] VPN2, VPN1, VPN0; + logic 
[8:0] VPN3, VPN2, VPN1, VPN0; - logic GigapageMisaligned, BadGigapage; + logic TerapageMisaligned, GigapageMisaligned, BadTerapage, BadGigapage; flopenl #(3) mmureg(HCLK, ~HRESETn, 1'b1, NextWalkerState, IDLE, WalkerState); always_comb begin case (WalkerState) - IDLE: if (MMUTranslate) NextWalkerState = LEVEL2; + IDLE: if (MMUTranslate) NextWalkerState = LEVEL3; else NextWalkerState = IDLE; + LEVEL3: if (SvMode != `SV48) NextWalkerState = LEVEL2; + // 3rd level used if SV48 is enabled. + else begin + if (~MMUReady) NextWalkerState = LEVEL3; + // *** According to the architecture, we should + // fault upon finding a superpage that is misaligned or has 0 + // access bit. The following commented line of code is + // supposed to perform that check. However, it is untested. + else if (ValidPTE && LeafPTE && ~BadTerapage) NextWalkerState = LEAF; + // else if (ValidPTE && LeafPTE) NextWalkerState = LEAF; // *** Once the above line is properly tested, delete this line. + else if (ValidPTE && ~LeafPTE) NextWalkerState = LEVEL2; + else NextWalkerState = FAULT; + end LEVEL2: if (~MMUReady) NextWalkerState = LEVEL2; // *** According to the architecture, we should // fault upon finding a superpage that is misaligned or has 0 @@ -242,24 +263,29 @@ module pagetablewalker ( else if (ValidPTE && LeafPTE && ~AccessAlert) NextWalkerState = LEAF; else NextWalkerState = FAULT; - LEAF: if (MMUTranslate) NextWalkerState = LEVEL2; + LEAF: if (MMUTranslate) NextWalkerState = LEVEL3; else NextWalkerState = IDLE; - FAULT: if (MMUTranslate) NextWalkerState = LEVEL2; + FAULT: if (MMUTranslate) NextWalkerState = LEVEL3; else NextWalkerState = IDLE; // Default case should never happen, but is included for linter. default: NextWalkerState = IDLE; endcase end + // A terapage is a level 3 leaf page. This page must have zero PPN[2], + // zero PPN[1], and zero PPN[0] + assign TerapageMisaligned = |(CurrentPPN[26:0]); // A gigapage is a Level 2 leaf page. 
This page must have zero PPN[1] and // zero PPN[0] assign GigapageMisaligned = |(CurrentPPN[17:0]); // A megapage is a Level 1 leaf page. This page must have zero PPN[0]. assign MegapageMisaligned = |(CurrentPPN[8:0]); + assign BadTerapage = TerapageMisaligned || AccessAlert; // *** Implement better access/dirty scheme assign BadGigapage = GigapageMisaligned || AccessAlert; // *** Implement better access/dirty scheme assign BadMegapage = MegapageMisaligned || AccessAlert; // *** Implement better access/dirty scheme + assign VPN3 = TranslationVAdr[47:39]; assign VPN2 = TranslationVAdr[38:30]; assign VPN1 = TranslationVAdr[29:21]; assign VPN0 = TranslationVAdr[20:12]; @@ -282,8 +308,13 @@ module pagetablewalker ( IDLE: begin MMUStall = '0; end + LEVEL3: begin + TranslationPAdr = {BasePageTablePPN, VPN3, 3'b000}; + // *** this is a huge breaking point. if we're going through level3 every time, even when sv48 is off, + // what should translationPAdr be when level3 is just off? + end LEVEL2: begin - TranslationPAdr = {BasePageTablePPN, VPN2, 3'b000}; + TranslationPAdr = {(SvMode == `SV48) ? CurrentPPN : BasePageTablePPN, VPN2, 3'b000}; end LEVEL1: begin TranslationPAdr = {CurrentPPN, VPN1, 3'b000}; @@ -295,8 +326,9 @@ module pagetablewalker ( // Keep physical address alive to prevent HADDR dropping to 0 TranslationPAdr = {CurrentPPN, VPN0, 3'b000}; PageTableEntry = CurrentPTE; - PageType = (WalkerState == LEVEL2) ? 2'b11 : - ((WalkerState == LEVEL1) ? 2'b01 : 2'b00); + PageType = (WalkerState == LEVEL3) ? 2'b11 : + ((WalkerState == LEVEL2) ? 2'b10 : + ((WalkerState == LEVEL1) ? 
2'b01 : 2'b00)); DTLBWriteM = DTLBMissM; ITLBWriteF = ~DTLBMissM; // Prefer data over instructions end diff --git a/wally-pipelined/src/mmu/tlb.sv b/wally-pipelined/src/mmu/tlb.sv index 7ed594e4..1828c98e 100644 --- a/wally-pipelined/src/mmu/tlb.sv +++ b/wally-pipelined/src/mmu/tlb.sv @@ -2,7 +2,9 @@ // tlb.sv // // Written: jtorrey@hmc.edu 16 February 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// Implemented SV48 on top of SV39. This included adding the SvMode signal, +// and using it to decide the translate signal and get the virtual page number // // Purpose: Translation lookaside buffer // Cache of virtural-to-physical address translations @@ -25,7 +27,7 @@ /////////////////////////////////////////// /** - * sv32 specs + * SV32 specs * ---------- * Virtual address [31:0] (32 bits) * [________________________________] @@ -85,14 +87,11 @@ module tlb #(parameter ENTRY_BITS = 3, output TLBPageFault ); - logic SvMode; logic Translate; logic TLBAccess, ReadAccess, WriteAccess; - // *** If we want to support multiple virtual memory modes (ie sv39 AND sv48), - // we could have some muxes that control which parameters are current. - // Although then some of the signals are not big enough. But that's a problem - // for much later. + // Store current virtual memory mode (SV32, SV39, SV48, etc.) logic [`SVMODE_BITS-1:0] SvMode; // Index (currently random) to write the next TLB entry logic [ENTRY_BITS-1:0] WriteIndex; @@ -116,17 +115,24 @@ module tlb #(parameter ENTRY_BITS = 3, // Whether the virtual address has a match in the CAM logic CAMHit; - // Grab the sv bit from SATP + // Grab the sv mode from SATP + assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS]; + + // The bus width is always the largest it could be for that XLEN. For example, vpn will be 36 bits wide in rv64, + // even though it could be 27 bits (SV39) or 36 bits (SV48) wide. 
When the value of VPN is narrower, + // the extra bits are used as padded zeros on the left of the full value. generate if (`XLEN == 32) begin - assign SvMode = SATP_REGW[31]; // *** change to an enum somehow? + assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12]; end else begin - assign SvMode = SATP_REGW[63]; // currently just a boolean whether translation enabled + assign VirtualPageNumber = (SvMode == `SV48) ? + VirtualAddress[`VPN_BITS+11:12] : + {{`VPN_SEGMENT_BITS{1'b0}}, VirtualAddress[3*`VPN_SEGMENT_BITS+11:12]}; end endgenerate // Whether translation should occur - assign Translate = SvMode & (PrivilegeModeW != `M_MODE); + assign Translate = (SvMode != `NO_TRANSLATE) & (PrivilegeModeW != `M_MODE); // Determine how the TLB is currently being used // Note that we use ReadAccess for both loads and instruction fetches @@ -134,7 +140,7 @@ module tlb #(parameter ENTRY_BITS = 3, assign WriteAccess = TLBAccessType[0]; assign TLBAccess = ReadAccess || WriteAccess; - assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12]; + assign PageOffset = VirtualAddress[11:0]; // TLB entries are evicted according to the LRU algorithm @@ -188,9 +194,10 @@ module tlb #(parameter ENTRY_BITS = 3, // page number. For 4 KB pages, the entire virtual page number is replaced. // For superpages, some segments are considered offsets into a larger page. 
page_number_mixer #(`PPN_BITS, `PPN_HIGH_SEGMENT_BITS) - physical_mixer(PhysicalPageNumber, + physical_mixer(PhysicalPageNumber, {{EXTRA_PHYSICAL_BITS{1'b0}}, VirtualPageNumber}, HitPageType, + SvMode, PhysicalPageNumberMixed); // Provide physical address only on TLBHits to cause catastrophic errors if diff --git a/wally-pipelined/src/mmu/tlb_cam.sv b/wally-pipelined/src/mmu/tlb_cam.sv index 330bb382..78d9ff8d 100644 --- a/wally-pipelined/src/mmu/tlb_cam.sv +++ b/wally-pipelined/src/mmu/tlb_cam.sv @@ -2,7 +2,9 @@ // tlb_cam.sv // // Written: jtorrey@hmc.edu 16 February 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// Implemented SV48 on top of SV39. This included adding the SvMode signal input and wally constants +// Mostly this was to make the cam_lines work. // // Purpose: Stores virtual page numbers with cached translations. // Determines whether a given virtual page number is in the TLB. @@ -24,18 +26,21 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /////////////////////////////////////////// +`include "wally-constants.vh" + module tlb_cam #(parameter ENTRY_BITS = 3, parameter KEY_BITS = 20, parameter HIGH_SEGMENT_BITS = 10) ( - input clk, reset, - input [KEY_BITS-1:0] VirtualPageNumber, - input [1:0] PageTypeWrite, - input [ENTRY_BITS-1:0] WriteIndex, - input TLBWrite, - input TLBFlush, - output [ENTRY_BITS-1:0] VPNIndex, - output [1:0] HitPageType, - output CAMHit + input clk, reset, + input [KEY_BITS-1:0] VirtualPageNumber, + input [1:0] PageTypeWrite, + input [ENTRY_BITS-1:0] WriteIndex, + input [`SVMODE_BITS-1:0] SvMode, + input TLBWrite, + input TLBFlush, + output [ENTRY_BITS-1:0] VPNIndex, + output [1:0] HitPageType, + output CAMHit ); localparam NENTRIES = 2**ENTRY_BITS;