From 735e5110735ae0717255a8ee9fcd00f9a2e19acd Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 25 May 2021 14:26:22 -0500 Subject: [PATCH 01/19] fixed bug with icache miss spill fsm branch. --- .../src/ifu/globalHistoryPredictor.sv | 78 +++++-------------- wally-pipelined/src/ifu/icache.sv | 22 ++++-- 2 files changed, 34 insertions(+), 66 deletions(-) diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv index 087458df3..b2357ecce 100644 --- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv +++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv @@ -32,76 +32,34 @@ module globalHistoryPredictor ) (input logic clk, input logic reset, - input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, + input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, input logic [`XLEN-1:0] LookUpPC, output logic [1:0] Prediction, // update input logic [`XLEN-1:0] UpdatePC, input logic UpdateEN, PCSrcE, input logic [1:0] UpdatePrediction - + ); - logic [k-1:0] GHRF, GHRFNext; - assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; + logic [k-1:0] GHRF, GHRFNext; + assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; - flopenr #(k) GlobalHistoryRegister(.clk(clk), - .reset(reset), - .en(UpdateEN), - .d(GHRFNext), - .q(GHRF)); - - - - logic [1:0] PredictionMemory; - logic DoForwarding, DoForwardingF; - logic [1:0] UpdatePredictionF; - + flopenr #(k) GlobalHistoryRegister(.clk(clk), + .reset(reset), + .en(UpdateEN), + .d(GHRFNext), + .q(GHRF)); // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT - // GHR referes to the address that the past k branches points to in the prediction stage - // GHRE refers to the address that the past k branches points to in the exectution stage - SRAM2P1R1W #(k, 2) PHT(.clk(clk), - .reset(reset), - .RA1(GHRF), - .RD1(PredictionMemory), - .REN1(~StallF), - .WA1(GHRFNext), - .WD1(UpdatePrediction), - .WEN1(UpdateEN), - .BitWEN1(2'b11)); + SRAM2P1R1W #(k, 2) 
PHT(.clk(clk), + .reset(reset), + .RA1(GHRF), + .RD1(Prediction), + .REN1(~StallF), + .WA1(GHRF), + .WD1(UpdatePrediction), + .WEN1(UpdateEN), + .BitWEN1(2'b11)); - // need to forward when updating to the same address as reading. - // first we compare to see if the update and lookup addreses are the same - assign DoForwarding = GHRF == GHRFNext; - - // register the update value and the forwarding signal into the Fetch stage - // TODO: add stall logic *** - flopr #(1) DoForwardingReg(.clk(clk), - .reset(reset), - .d(DoForwarding), - .q(DoForwardingF)); - - flopr #(2) UpdatePredictionReg(.clk(clk), - .reset(reset), - .d(UpdatePrediction), - .q(UpdatePredictionF)); - - assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory; - - //pipeline for GHR - /*flopenrc #(k) GHRDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .clear(FlushD), - .d(GHRF), - .q(GHRD)); - - flopenrc #(k) GHREReg(.clk(clk), - .reset(reset), - .en(~StallE), - .clear(FlushE), - .d(GHRD), - .q(GHRE)); -*/ endmodule diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 9e30a083a..4f51edd79 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -154,15 +154,16 @@ module icachecontroller #(parameter LINESIZE = 256) ( localparam STATE_MISS_SPILL_FETCH_DONE = 10; // write data into SRAM/LUT localparam STATE_MISS_SPILL_READ1 = 11; // read block 0 from SRAM/LUT localparam STATE_MISS_SPILL_2 = 12; // return to ready if hit or do second block update. - localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 13; // miss on block 1, issue read to AHB and wait - localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 14; // write data to SRAM/LUT - localparam STATE_MISS_SPILL_MERGE = 15; // read block 0 of CPU access, + localparam STATE_MISS_SPILL_2_START = 13; // return to ready if hit or do second block update. 
+ localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 14; // miss on block 1, issue read to AHB and wait + localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 15; // write data to SRAM/LUT + localparam STATE_MISS_SPILL_MERGE = 16; // read block 0 of CPU access, - localparam STATE_MISS_SPILL_FINAL = 16; // this state replicates STATE_READY's replay of the + localparam STATE_MISS_SPILL_FINAL = 17; // this state replicates STATE_READY's replay of the // spill access but does nto consider spill. It also does not do another operation. - localparam STATE_INVALIDATE = 17; // *** not sure if invalidate or evict? invalidate by cache block or address? + localparam STATE_INVALIDATE = 18; // *** not sure if invalidate or evict? invalidate by cache block or address? localparam AHBByteLength = `XLEN / 8; localparam AHBOFFETWIDTH = $clog2(AHBByteLength); @@ -380,11 +381,20 @@ module icachecontroller #(parameter LINESIZE = 256) ( PCMux = 2'b10; UnalignedSelect = 1'b1; spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm. + ICacheReadEn = 1'b1; + NextState = STATE_MISS_SPILL_2_START; + end + STATE_MISS_SPILL_2_START: begin if (~hit) begin CntReset = 1'b1; NextState = STATE_MISS_SPILL_MISS_FETCH_WDV; end else begin - NextState = STATE_MISS_SPILL_FINAL; + NextState = STATE_READY; + ICacheReadEn = 1'b1; + PCMux = 2'b00; + UnalignedSelect = 1'b1; + SavePC = 1'b1; + ICacheStallF = 1'b0; end end STATE_MISS_SPILL_MISS_FETCH_WDV: begin From 668a79cf77931379c54cb7abf4f1d8306d63f473 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Thu, 27 May 2021 11:48:29 -0500 Subject: [PATCH 02/19] Updated benchmarking code. 
--- testsBP/crt0/Makefile | 4 ++-- testsBP/crt0/start.s | 7 +++---- testsBP/mibench_qsort/Makefile | 2 +- testsBP/sieve/Makefile | 2 +- testsBP/sieve/sieve.c | 20 ++++++++++---------- testsBP/simple/Makefile | 2 +- testsBP/simple/header.h | 1 + testsBP/simple/main.c | 1 + 8 files changed, 20 insertions(+), 19 deletions(-) diff --git a/testsBP/crt0/Makefile b/testsBP/crt0/Makefile index ab47384fc..b42e86cb8 100644 --- a/testsBP/crt0/Makefile +++ b/testsBP/crt0/Makefile @@ -9,7 +9,7 @@ MABI :=-mabi=lp64 LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles AFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -W -CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -mcmodel=medany +CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -mcmodel=medany -O2 AS=riscv64-unknown-elf-as CC=riscv64-unknown-elf-gcc AR=riscv64-unknown-elf-ar @@ -19,7 +19,7 @@ all: libcrt0.a %.o: %.s ${AS} ${AFLAGS} -c $< -o $@ -libcrt0.a: start.o +libcrt0.a: start.o pcnt_driver.o pre_main.o ${AR} -r $@ $^ clean: diff --git a/testsBP/crt0/start.s b/testsBP/crt0/start.s index 19a240d87..731a61e34 100644 --- a/testsBP/crt0/start.s +++ b/testsBP/crt0/start.s @@ -43,11 +43,10 @@ _start: - # set the stack pointer to the top of memory - # 0x8000_0000 + 64K - 8 bytes - li sp, 0x007FFFF8 + # set the stack pointer to the top of memory - 8 bytes (pointer size) + li sp, 0x07FFFFF8 - jal ra, main + jal ra, pre_main jal ra, _halt .section .text diff --git a/testsBP/mibench_qsort/Makefile b/testsBP/mibench_qsort/Makefile index f4d368392..b1cf7b679 100644 --- a/testsBP/mibench_qsort/Makefile +++ b/testsBP/mibench_qsort/Makefile @@ -8,7 +8,7 @@ MARCH :=-march=rv64ic MABI :=-mabi=lp64 LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map -CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align +CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align -O2 CC=riscv64-unknown-elf-gcc DA=riscv64-unknown-elf-objdump -d diff --git a/testsBP/sieve/Makefile b/testsBP/sieve/Makefile index 
1d38d123d..9c884f48a 100644 --- a/testsBP/sieve/Makefile +++ b/testsBP/sieve/Makefile @@ -8,7 +8,7 @@ MARCH :=-march=rv64ic MABI :=-mabi=lp64 LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map -CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align +CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align -O2 CC=riscv64-unknown-elf-gcc DA=riscv64-unknown-elf-objdump -d diff --git a/testsBP/sieve/sieve.c b/testsBP/sieve/sieve.c index e82074045..f7d36d957 100644 --- a/testsBP/sieve/sieve.c +++ b/testsBP/sieve/sieve.c @@ -66,21 +66,21 @@ int main () { ans = sieve (); //gettimeofday(&after , NULL); - if (ans != 1899) - printf ("Sieve result wrong, ans = %d, expected 1899", ans); + /* /\* /\\* if (ans != 1899) *\\/ *\/ */ + /* /\* /\\* printf ("Sieve result wrong, ans = %d, expected 1899", ans); *\\/ *\/ */ - //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); + /* /\* //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); *\/ */ - printf("Round 2\n"); - //gettimeofday(&before , NULL); + /* /\* printf("Round 2\n"); *\/ */ + /* //gettimeofday(&before , NULL); */ - ans = sieve (); - //gettimeofday(&after , NULL); - if (ans != 1899) - printf ("Sieve result wrong, ans = %d, expected 1899", ans); + /* ans = sieve (); */ + /* //gettimeofday(&after , NULL); */ + /* if (ans != 1899) */ + /* printf ("Sieve result wrong, ans = %d, expected 1899", ans); */ - //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); + /* //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); */ return 0; diff --git a/testsBP/simple/Makefile b/testsBP/simple/Makefile index 450aacaa4..4447f2843 100644 --- a/testsBP/simple/Makefile +++ b/testsBP/simple/Makefile @@ -8,7 +8,7 @@ MARCH :=-march=rv64ic MABI :=-mabi=lp64 LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map -CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align +CFLAGS =$(MARCH) 
$(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany -mstrict-align -O2 CC=riscv64-unknown-elf-gcc DA=riscv64-unknown-elf-objdump -d diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h index bfe014a4b..6def656f8 100644 --- a/testsBP/simple/header.h +++ b/testsBP/simple/header.h @@ -5,4 +5,5 @@ int fail(); int simple_csrbr_test(); int lbu_test(); int icache_spill_test(); +void global_hist_test(); #endif diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c index 0d14fcfb8..036a351d1 100644 --- a/testsBP/simple/main.c +++ b/testsBP/simple/main.c @@ -2,6 +2,7 @@ int main(){ //int res = icache_spill_test(); + global_hist_test(); int res = 1; if (res < 0) { fail(); From 40bdcda32db6e6a74954b85d1df0a2645ddf5c17 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Thu, 27 May 2021 23:06:28 -0500 Subject: [PATCH 03/19] It's a bit sloppy, but the global history predictor is working correctly now. There were two major bugs with the predictor. First the update mechanism was completely wrong. The PHT is updated with the GHR that was used to lookup the prediction. PHT[GHR] = Sat2(PHT[GHR], branch outcome). Second the GHR needs to be updated speculatively as the branch is predicted. This is important so that back to back branches' GHRs are not the same. They must be different to avoid aliasing. Speculation of the GHR update allows them to be different. On misprediction the GHR must be reverted. This implementation is a bit sloppy with names and now the GHR recovery is performed. Updates to follow. 
--- wally-pipelined/config/rv64BP/wally-config.vh | 3 +- wally-pipelined/src/ifu/bpred.sv | 9 ++-- .../src/ifu/globalHistoryPredictor.sv | 47 +++++++++++++++++-- wally-pipelined/src/ifu/ifu.sv | 9 +--- 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh index 17a8c284a..fd482bfde 100644 --- a/wally-pipelined/config/rv64BP/wally-config.vh +++ b/wally-pipelined/config/rv64BP/wally-config.vh @@ -110,5 +110,6 @@ `define TWO_BIT_PRELOAD "../config/rv64icfd/twoBitPredictor.txt" `define BTB_PRELOAD "../config/rv64icfd/BTBPredictor.txt" `define BPRED_ENABLED 1 -`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE +//`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE +`define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE" or BPLOCALPAg or BPGSHARE `define TESTSBP 1 diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv index de0f8143b..c5b4dde48 100644 --- a/wally-pipelined/src/ifu/bpred.sv +++ b/wally-pipelined/src/ifu/bpred.sv @@ -30,7 +30,8 @@ module bpred (input logic clk, reset, - input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, + input logic StallF, StallD, StallE, StallM, StallW, + input logic FlushF, FlushD, FlushE, FlushM, FlushW, // Fetch stage // the prediction input logic [`XLEN-1:0] PCNextF, // *** forgot to include this one on the I/O list @@ -93,6 +94,8 @@ module bpred // update .UpdatePC(PCE), .UpdateEN(InstrClassE[0] & ~StallE), + .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF), + .BPPredDirWrongE(BPPredDirWrongE), .PCSrcE(PCSrcE), .UpdatePrediction(UpdateBPPredE)); end else if (`BPTYPE == "BPGSHARE") begin:Predictor @@ -190,14 +193,14 @@ module bpred flopenrc #(2) BPPredRegD(.clk(clk), .reset(reset), .en(~StallD), - .clear(FlushD), + .clear(1'b0), .d(BPPredF), .q(BPPredD)); flopenrc #(2) BPPredRegE(.clk(clk), .reset(reset), .en(~StallE), - .clear(FlushE), + .clear(1'b0), .d(BPPredD), .q(BPPredE)); 
diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv index b2357ecce..fadbf004b 100644 --- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv +++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv @@ -37,29 +37,66 @@ module globalHistoryPredictor output logic [1:0] Prediction, // update input logic [`XLEN-1:0] UpdatePC, - input logic UpdateEN, PCSrcE, + input logic UpdateEN, PCSrcE, + input logic SpeculativeUpdateEn, BPPredDirWrongE, input logic [1:0] UpdatePrediction ); - logic [k-1:0] GHRF, GHRFNext; - assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; + logic [k-1:0] GHRF, GHRFNext, GHRD, GHRE, GHRLookup; + + logic FlushedD, FlushedE; + + + // if the prediction is wrong we need to restore the ghr. + assign GHRFNext = BPPredDirWrongE ? {PCSrcE, GHRE[k-1:1]} : + {Prediction[1], GHRF[k-1:1]}; flopenr #(k) GlobalHistoryRegister(.clk(clk), .reset(reset), - .en(UpdateEN), + .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)), .d(GHRFNext), .q(GHRF)); + // if actively updating the GHR at the time of prediction we want to us + // GHRFNext as the lookup rather than GHRF. + + assign GHRLookup = UpdateEN ? 
GHRFNext : GHRF; + // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT SRAM2P1R1W #(k, 2) PHT(.clk(clk), .reset(reset), .RA1(GHRF), .RD1(Prediction), .REN1(~StallF), - .WA1(GHRF), + .WA1(GHRE), .WD1(UpdatePrediction), .WEN1(UpdateEN), .BitWEN1(2'b11)); + flopenr #(k) GlobalHistoryRegisterD(.clk(clk), + .reset(reset), + .en(~StallD & ~FlushedE), + .d(GHRF), + .q(GHRD)); + + flopenr #(k) GlobalHistoryRegisterE(.clk(clk), + .reset(reset), + .en(~StallE & ~ FlushedE), + .d(GHRD), + .q(GHRE)); + + + flopenr #(1) flushedDReg(.clk(clk), + .reset(reset), + .en(~StallD), + .d(FlushD), + .q(FlushedD)); + + flopenr #(1) flushedEReg(.clk(clk), + .reset(reset), + .en(~StallE), + .d(FlushE | FlushedD), + .q(FlushedE)); + endmodule diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv index 994288bd8..0922f7877 100644 --- a/wally-pipelined/src/ifu/ifu.sv +++ b/wally-pipelined/src/ifu/ifu.sv @@ -153,14 +153,7 @@ module ifu ( generate if (`BPRED_ENABLED == 1) begin : bpred // I am making the port connection explicit for now as I want to see them and they will be changing. 
- bpred bpred(.clk(clk), - .reset(reset), - .StallF(StallF), - .StallD(StallD), - .StallE(StallE), - .FlushF(FlushF), - .FlushD(FlushD), - .FlushE(FlushE), + bpred bpred(.*, .PCNextF(PCNextF), .BPPredPCF(BPPredPCF), .SelBPPredF(SelBPPredF), From 529226ac8d906f60bcd5bb5e143f310471c32b25 Mon Sep 17 00:00:00 2001 From: Kip Macsai-Goren Date: Fri, 28 May 2021 18:09:28 -0400 Subject: [PATCH 04/19] made priority encoder parameterizable --- wally-pipelined/src/mmu/priority_encoder.sv | 68 ++++++++------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/wally-pipelined/src/mmu/priority_encoder.sv b/wally-pipelined/src/mmu/priority_encoder.sv index e4a62ce17..dade2e834 100644 --- a/wally-pipelined/src/mmu/priority_encoder.sv +++ b/wally-pipelined/src/mmu/priority_encoder.sv @@ -4,7 +4,11 @@ // Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021 // Based on implementation from https://www.allaboutcircuits.com/ip-cores/communication-controller/priority-encoder/ // *** Give proper LGPL attribution for above source -// Modified: +// Modified: Teo Ene 15 Apr 2021: +// Temporarily removed paramterized priority encoder for non-parameterized one +// To get synthesis working quickly +// Kmacsaigoren@hmc.edu 28 May 2021: +// Added working version of parameterized priority encoder. 
// // Purpose: One-hot encoding to binary encoder // @@ -27,51 +31,33 @@ `include "wally-config.vh" -// Teo Ene 04/15: -// Temporarily removed paramterized priority encoder for non-parameterized one -// To get synthesis working quickly module priority_encoder #(parameter BINARY_BITS = 3) ( - input logic [7:0] one_hot, - output logic [2:0] binary + input logic [2**BINARY_BITS - 1:0] one_hot, + output logic [BINARY_BITS - 1:0] binary ); - // localparam ONE_HOT_BITS = 2**BINARY_BITS; - - /* - genvar i, j; - generate - for (i = 0; i < ONE_HOT_BITS; i++) begin - for (j = 0; j < BINARY_BITS; j++) begin - if (i[j]) begin - assign binary[j] = one_hot[i]; - end - end - end - endgenerate - */ - - /* - logic [BINARY_BITS-1:0] binary_comb; - + integer i; always_comb begin - binary_comb = 0; - for (int i = 0; i < ONE_HOT_BITS; i++) - if (one_hot[i]) binary_comb = i; + binary = 0; + for (i = 0; i < 2**BINARY_BITS; i++) begin + if (one_hot[i]) binary = i; // prioritizes the most significant bit + end end + // *** triple check synthesizability here - assign binary = binary_comb; + // Ideally this mimics the following: + /* + always_comb begin + casex (one_hot) + 1xx ... x: binary = BINARY_BITS - 1; + 01x ... x: binary = BINARY_BITS - 2; + 001 ... x: binary = BINARY_BITS - 3; + + {...} + + 00 ... 1xx: binary = 2; + 00 ... 01x: binary = 1; + 00 ... 001: binary = 0; + end */ - always_comb - case (one_hot) - 8'h1: binary=3'h0; - 8'h2: binary=3'h1; - 8'h4: binary=3'h2; - 8'h8: binary=3'h3; - 8'h10: binary=3'h4; - 8'h20: binary=3'h5; - 8'h40: binary=3'h6; - 8'h80: binary=3'h7; - default: binary=3'h0; //should never happen - endcase - endmodule From 889b93563085354846a054cb4e9dde20f9145813 Mon Sep 17 00:00:00 2001 From: "James E. Stine" Date: Mon, 31 May 2021 08:36:19 -0400 Subject: [PATCH 05/19] Modify elements of generics for LZD and shifter wrote for integer divider. 
--- wally-pipelined/src/generic/lzd.sv | 195 +++++++++++++++++++++++++++ wally-pipelined/src/generic/lzd.sv~ | 195 +++++++++++++++++++++++++++ wally-pipelined/src/generic/shift.sv | 76 +++++++++++ wally-pipelined/src/muldiv/div.sv | 146 +------------------- 4 files changed, 471 insertions(+), 141 deletions(-) create mode 100755 wally-pipelined/src/generic/lzd.sv create mode 100755 wally-pipelined/src/generic/lzd.sv~ create mode 100755 wally-pipelined/src/generic/shift.sv diff --git a/wally-pipelined/src/generic/lzd.sv b/wally-pipelined/src/generic/lzd.sv new file mode 100755 index 000000000..98642c150 --- /dev/null +++ b/wally-pipelined/src/generic/lzd.sv @@ -0,0 +1,195 @@ +/////////////////////////////////////////// +// lzd.sv +// +// Written: James.Stine@okstate.edu 1 February 2021 +// Modified: +// +// Purpose: Integer Divide instructions +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +`include "wally-config.vh" +/* verilator lint_off DECLFILENAME */ + +// Original idea came from V. G. Oklobdzija, "An algorithmic and novel +// design of a leading zero detector circuit: comparison with logic +// synthesis," in IEEE Transactions on Very Large Scale Integration +// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi: +// 10.1109/92.273153. + +// Modified to be more hierarchical + +module lzd2 (P, V, B); + + input logic [1:0] B; + + output logic P; + output logic V; + + assign V = B[0] | B[1]; + assign P = B[0] & ~B[1]; + +endmodule // lz2 + +module lzd_hier #(parameter WIDTH=8) + (input logic [WIDTH-1:0] B, + output logic [$clog2(WIDTH)-1:0] ZP, + output logic ZV); + + if (WIDTH == 128) + lzd128 lz127 (ZP, ZV, B); + else if (WIDTH == 64) + lzd64 lz64 (ZP, ZV, B); + else if (WIDTH == 32) + lzd32 lz32 (ZP, ZV, B); + else if (WIDTH == 16) + lzd16 lz16 (ZP, ZV, B); + else if (WIDTH == 8) + lzd8 lz8 (ZP, ZV, B); + else if (WIDTH == 4) + lzd4 lz4 (ZP, ZV, B); + +endmodule // lzd_hier + +module lzd4 (ZP, ZV, B); + + input logic [3:0] B; + + logic ZPa; + logic ZPb; + logic ZVa; + logic ZVb; + + output logic [1:0] ZP; + output logic ZV; + + lz2 l1(ZPa, ZVa, B[1:0]); + lz2 l2(ZPb, ZVb, B[3:2]); + + assign ZP[0:0] = ZVb ? ZPb : ZPa; + assign ZP[1] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd4 + +module lzd8 (ZP, ZV, B); + + input logic [7:0] B; + + logic [1:0] ZPa; + logic [1:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [2:0] ZP; + output logic ZV; + + lz4 l1(ZPa, ZVa, B[3:0]); + lz4 l2(ZPb, ZVb, B[7:4]); + + assign ZP[1:0] = ZVb ? 
ZPb : ZPa; + assign ZP[2] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd8 + +module lzd16 (ZP, ZV, B); + + input logic [15:0] B; + + logic [2:0] ZPa; + logic [2:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [3:0] ZP; + output logic ZV; + + lz8 l1(ZPa, ZVa, B[7:0]); + lz8 l2(ZPb, ZVb, B[15:8]); + + assign ZP[2:0] = ZVb ? ZPb : ZPa; + assign ZP[3] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd16 + +module lzd32 (ZP, ZV, B); + + input logic [31:0] B; + + logic [3:0] ZPa; + logic [3:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [4:0] ZP; + output logic ZV; + + lz16 l1(ZPa, ZVa, B[15:0]); + lz16 l2(ZPb, ZVb, B[31:16]); + + assign ZP[3:0] = ZVb ? ZPb : ZPa; + assign ZP[4] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd32 + +module lzd64 (ZP, ZV, B); + + input logic [63:0] B; + + logic [4:0] ZPa; + logic [4:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [5:0] ZP; + output logic ZV; + + lz32 l1(ZPa, ZVa, B[31:0]); + lz32 l2(ZPb, ZVb, B[63:32]); + + assign ZP[4:0] = ZVb ? ZPb : ZPa; + assign ZP[5] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd64 + +module lzd128 (ZP, ZV, B); + + input logic [127:0] B; + + logic [5:0] ZPa; + logic [5:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [6:0] ZP; + output logic ZV; + + lz64 l1(ZPa, ZVa, B[64:0]); + lz64 l2(ZPb, ZVb, B[127:63]); + + assign ZP[5:0] = ZVb ? ZPb : ZPa; + assign ZP[6] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lzd128 + +/* verilator lint_on DECLFILENAME */ diff --git a/wally-pipelined/src/generic/lzd.sv~ b/wally-pipelined/src/generic/lzd.sv~ new file mode 100755 index 000000000..bfffe5e5b --- /dev/null +++ b/wally-pipelined/src/generic/lzd.sv~ @@ -0,0 +1,195 @@ +/////////////////////////////////////////// +// lzd.sv +// +// Written: James.Stine@okstate.edu 1 February 2021 +// Modified: +// +// Purpose: Integer Divide instructions +// +// A component of the Wally configurable RISC-V project. 
+// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +`include "wally-config.vh" +/* verilator lint_off DECLFILENAME */ + +// Original idea came from V. G. Oklobdzija, "An algorithmic and novel +// design of a leading zero detector circuit: comparison with logic +// synthesis," in IEEE Transactions on Very Large Scale Integration +// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi: +// 10.1109/92.273153. 
+ +// Modified to be more hierarchical + +module lz2 (P, V, B); + + input logic [1:0] B; + + output logic P; + output logic V; + + assign V = B[0] | B[1]; + assign P = B[0] & ~B[1]; + +endmodule // lz2 + +module lzd_hier #(parameter WIDTH=8) + (input logic [WIDTH-1:0] B, + output logic [$clog2(WIDTH)-1:0] ZP, + output logic ZV); + + if (WIDTH == 128) + lz128 lzd127 (ZP, ZV, B); + else if (WIDTH == 64) + lz64 lzd64 (ZP, ZV, B); + else if (WIDTH == 32) + lz32 lzd32 (ZP, ZV, B); + else if (WIDTH == 16) + lz16 lzd16 (ZP, ZV, B); + else if (WIDTH == 8) + lz8 lzd8 (ZP, ZV, B); + else if (WIDTH == 4) + lz4 lzd4 (ZP, ZV, B); + +endmodule // lzd_hier + +module lz4 (ZP, ZV, B); + + input logic [3:0] B; + + logic ZPa; + logic ZPb; + logic ZVa; + logic ZVb; + + output logic [1:0] ZP; + output logic ZV; + + lz2 l1(ZPa, ZVa, B[1:0]); + lz2 l2(ZPb, ZVb, B[3:2]); + + assign ZP[0:0] = ZVb ? ZPb : ZPa; + assign ZP[1] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule + +module lz8 (ZP, ZV, B); + + input logic [7:0] B; + + logic [1:0] ZPa; + logic [1:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [2:0] ZP; + output logic ZV; + + lz4 l1(ZPa, ZVa, B[3:0]); + lz4 l2(ZPb, ZVb, B[7:4]); + + assign ZP[1:0] = ZVb ? ZPb : ZPa; + assign ZP[2] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule + +module lz16 (ZP, ZV, B); + + input logic [15:0] B; + + logic [2:0] ZPa; + logic [2:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [3:0] ZP; + output logic ZV; + + lz8 l1(ZPa, ZVa, B[7:0]); + lz8 l2(ZPb, ZVb, B[15:8]); + + assign ZP[2:0] = ZVb ? ZPb : ZPa; + assign ZP[3] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lz16 + +module lz32 (ZP, ZV, B); + + input logic [31:0] B; + + logic [3:0] ZPa; + logic [3:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [4:0] ZP; + output logic ZV; + + lz16 l1(ZPa, ZVa, B[15:0]); + lz16 l2(ZPb, ZVb, B[31:16]); + + assign ZP[3:0] = ZVb ? 
ZPb : ZPa; + assign ZP[4] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lz32 + +module lz64 (ZP, ZV, B); + + input logic [63:0] B; + + logic [4:0] ZPa; + logic [4:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [5:0] ZP; + output logic ZV; + + lz32 l1(ZPa, ZVa, B[31:0]); + lz32 l2(ZPb, ZVb, B[63:32]); + + assign ZP[4:0] = ZVb ? ZPb : ZPa; + assign ZP[5] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lz64 + +module lz128 (ZP, ZV, B); + + input logic [127:0] B; + + logic [5:0] ZPa; + logic [5:0] ZPb; + logic ZVa; + logic ZVb; + + output logic [6:0] ZP; + output logic ZV; + + lz64 l1(ZPa, ZVa, B[64:0]); + lz64 l2(ZPb, ZVb, B[127:63]); + + assign ZP[5:0] = ZVb ? ZPb : ZPa; + assign ZP[6] = ~ZVb; + assign ZV = ZVa | ZVb; + +endmodule // lz128 + +/* verilator lint_on DECLFILENAME */ diff --git a/wally-pipelined/src/generic/shift.sv b/wally-pipelined/src/generic/shift.sv new file mode 100755 index 000000000..881525882 --- /dev/null +++ b/wally-pipelined/src/generic/shift.sv @@ -0,0 +1,76 @@ +/////////////////////////////////////////// +// shifters.sv +// +// Written: James.Stine@okstate.edu 1 February 2021 +// Modified: +// +// Purpose: Integer Divide instructions +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +`include "wally-config.vh" +/* verilator lint_off DECLFILENAME */ +/* verilator lint_off UNOPTFLAT */ + +module shift_right #(parameter WIDTH=8) + (input logic [WIDTH-1:0] A, + input logic [$clog2(WIDTH)-1:0] Shift, + output logic [WIDTH-1:0] Z); + + logic [WIDTH-1:0] stage [$clog2(WIDTH):0]; + logic sign; + genvar i; + + assign stage[0] = A; + generate + for (i=0;i<$clog2(WIDTH);i=i+1) + begin : genbit + mux2 #(WIDTH) mux_inst (stage[i], + {{(WIDTH/(2**(i+1))){1'b0}}, stage[i][WIDTH-1:WIDTH/(2**(i+1))]}, + Shift[$clog2(WIDTH)-i-1], + stage[i+1]); + end + endgenerate + assign Z = stage[$clog2(WIDTH)]; + +endmodule // shift_right + +module shift_left #(parameter WIDTH=8) + (input logic [WIDTH-1:0] A, + input logic [$clog2(WIDTH)-1:0] Shift, + output logic [WIDTH-1:0] Z); + + logic [WIDTH-1:0] stage [$clog2(WIDTH):0]; + genvar i; + + assign stage[0] = A; + generate + for (i=0;i<$clog2(WIDTH);i=i+1) + begin : genbit + mux2 #(WIDTH) mux_inst (stage[i], + {stage[i][WIDTH-1-WIDTH/(2**(i+1)):0], {(WIDTH/(2**(i+1))){1'b0}}}, + Shift[$clog2(WIDTH)-i-1], + stage[i+1]); + end + endgenerate + assign Z = stage[$clog2(WIDTH)]; + +endmodule // shift_left + +/* verilator lint_on DECLFILENAME */ +/* verilator lint_on UNOPTFLAT */ diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv index db830ca34..4266ae61a 100755 --- a/wally-pipelined/src/muldiv/div.sv +++ b/wally-pipelined/src/muldiv/div.sv @@ -78,11 +78,7 @@ module div (Qf, remf, 
done, divBusy, div0, N, D, clk, reset, start, S); assign D_NegOne = &D; // Divider goes the distance to 37 cycles - // (thanks the evil divisor for D = 0x1) - // but could theoretically be stopped when - // divdone is asserted. The enable signal - // turns off register storage thus invalidating - // any future cycles. + // (thanks to the evil divisor for D = 0x1) // Shift D, if needed (for integer) // needed to allow qst to be in range for integer @@ -93,8 +89,8 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); // exception is given to FSM to tell the operation to // quit gracefully. - lz64 p1 (P, V, twoD); - shifter_l64 p2 (op2, twoD, P); + lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD)); + shift_left #(64) p2 (twoD, P, op2); assign op1 = twoN; assign div0 = ~V; @@ -141,9 +137,8 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); assign Q = Qd2[63:0]; assign Rem5 = Rd2[64:1]; - // Adjust remainder by m (no need to adjust by - // n ln(r) - shifter_r64 p4 (rem0, Rem5, RemShift); + // Adjust remainder by m + shift_right #(64) p4 (Rem5, RemShift, rem0); // Adjust Q/Rem for Signed assign tcQ = (SignN ^ SignD) & S; @@ -368,8 +363,6 @@ module qst4 (input logic [6:0] s, input logic [2:0] d, endmodule // qst4 -// LZD - module lz2 (P, V, B0, B1); input logic B0; @@ -497,7 +490,6 @@ module lz64 (ZP, ZV, B); endmodule // lz64 // FSM Control for Integer Divider - module fsm64 (en, state0, done, divdone, otfzero, divBusy, start, error, NumIter, clk, reset); @@ -1505,134 +1497,6 @@ module magcompare8 (LT, EQ, A, B); endmodule // magcompare8 -module shifter_l64 (Z, A, Shift); - - input logic [63:0] A; - input logic [5:0] Shift; - - logic [63:0] stage1; - logic [63:0] stage2; - logic [63:0] stage3; - logic [63:0] stage4; - logic [63:0] stage5; - - output logic [63:0] Z; - - mux2 #(64) mx01(A, {A[31:0], 32'h0}, Shift[5], stage1); - mux2 #(64) mx02(stage1, {stage1[47:0], 16'h0}, Shift[4], stage2); - mux2 #(64) mx03(stage2, {stage2[55:0], 
8'h0}, Shift[3], stage3); - mux2 #(64) mx04(stage3, {stage3[59:0], 4'h0}, Shift[2], stage4); - mux2 #(64) mx05(stage4, {stage4[61:0], 2'h0}, Shift[1], stage5); - mux2 #(64) mx06(stage5, {stage5[62:0], 1'h0}, Shift[0], Z); - -endmodule // shifter_l64 - -module shifter_r64 (Z, A, Shift); - - input logic [63:0] A; - input logic [5:0] Shift; - - logic [63:0] stage1; - logic [63:0] stage2; - logic [63:0] stage3; - logic [63:0] stage4; - logic [63:0] stage5; - - output logic [63:0] Z; - - mux2 #(64) mx01(A, {32'h0, A[63:32]}, Shift[5], stage1); - mux2 #(64) mx02(stage1, {16'h0, stage1[63:16]}, Shift[4], stage2); - mux2 #(64) mx03(stage2, {8'h0, stage2[63:8]}, Shift[3], stage3); - mux2 #(64) mx04(stage3, {4'h0, stage3[63:4]}, Shift[2], stage4); - mux2 #(64) mx05(stage4, {2'h0, stage4[63:2]}, Shift[1], stage5); - mux2 #(64) mx06(stage5, {1'h0, stage5[63:1]}, Shift[0], Z); - -endmodule // shifter_r64 - -module shifter_l32 (Z, A, Shift); - - input logic [31:0] A; - input logic [4:0] Shift; - - logic [31:0] stage1; - logic [31:0] stage2; - logic [31:0] stage3; - logic [31:0] stage4; - - output logic [31:0] Z; - - mux2 #(32) mx01(A, {A[15:0], 16'h0}, Shift[4], stage1); - mux2 #(32) mx02(stage1, {stage1[23:0], 8'h0}, Shift[3], stage2); - mux2 #(32) mx03(stage2, {stage2[27:0], 4'h0}, Shift[2], stage3); - mux2 #(32) mx04(stage3, {stage3[29:0], 2'h0}, Shift[1], stage4); - mux2 #(32) mx05(stage4, {stage4[30:0], 1'h0}, Shift[0], Z); - -endmodule // shifter_l32 - -module shifter_r32 (Z, A, Shift); - - input logic [31:0] A; - input logic [4:0] Shift; - - logic [31:0] stage1; - logic [31:0] stage2; - logic [31:0] stage3; - logic [31:0] stage4; - - output logic [31:0] Z; - - mux2 #(32) mx01(A, {16'h0, A[31:16]}, Shift[4], stage1); - mux2 #(32) mx02(stage1, {8'h0, stage1[31:8]}, Shift[3], stage2); - mux2 #(32) mx03(stage2, {4'h0, stage2[31:4]}, Shift[2], stage3); - mux2 #(32) mx04(stage3, {2'h0, stage3[31:2]}, Shift[1], stage4); - mux2 #(32) mx05(stage4, {1'h0, stage4[31:1]}, Shift[0], 
Z); - -endmodule // shifter_r32 - -module shift_right #(parameter WIDTH=8) - (input logic [`XLEN-1:0] A, - input logic [$clog2(`XLEN)-1:0] Shift, - output logic [`XLEN-1:0] Z); - - logic [`XLEN-1:0] stage [$clog2(`XLEN):0]; - genvar i; - - assign stage[0] = A; - generate - for (i=0;i<$clog2(`XLEN);i=i+1) - begin : genbit - mux2 #(`XLEN) mux_inst (stage[i], - {{(`XLEN/(2**(i+1))){1'b0}}, stage[i][`XLEN-1:`XLEN/(2**(i+1))]}, - Shift[$clog2(`XLEN)-i-1], - stage[i+1]); - end - endgenerate - assign Z = stage[$clog2(`XLEN)]; - -endmodule // shift_right - -module shift_left #(parameter WIDTH=8) - (input logic [`XLEN-1:0] A, - input logic [$clog2(`XLEN)-1:0] Shift, - output logic [`XLEN-1:0] Z); - - logic [`XLEN-1:0] stage [$clog2(`XLEN):0]; - genvar i; - - assign stage[0] = A; - generate - for (i=0;i<$clog2(`XLEN);i=i+1) - begin : genbit - mux2 #(`XLEN) mux_inst (stage[i], - {stage[i][`XLEN-1-`XLEN/(2**(i+1)):0], {(`XLEN/(2**(i+1))){1'b0}}}, - Shift[$clog2(`XLEN)-i-1], - stage[i+1]); - end - endgenerate - assign Z = stage[$clog2(`XLEN)]; - -endmodule // shift_right - module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); input logic [63:0] Q; From 2f365a9e07ef62b5f7128eb81e7e6b993bc5ae72 Mon Sep 17 00:00:00 2001 From: "James E. Stine" Date: Mon, 31 May 2021 09:12:21 -0400 Subject: [PATCH 06/19] Add enhancements to integer divider including: - better comments - optimize FSM to end earlier - passes for 32-bit or 64-bit depending on parameter to intdiv Left div.bak in just in case have to revert back to original for now. 
--- wally-pipelined/src/muldiv/div.bak | 1560 ++++++++++++++++++++++++++ wally-pipelined/src/muldiv/div.sv | 614 ++++------ wally-pipelined/src/muldiv/muldiv.sv | 3 +- 3 files changed, 1773 insertions(+), 404 deletions(-) create mode 100755 wally-pipelined/src/muldiv/div.bak diff --git a/wally-pipelined/src/muldiv/div.bak b/wally-pipelined/src/muldiv/div.bak new file mode 100755 index 000000000..4266ae61a --- /dev/null +++ b/wally-pipelined/src/muldiv/div.bak @@ -0,0 +1,1560 @@ +/////////////////////////////////////////// +// mul.sv +// +// Written: James.Stine@okstate.edu 1 February 2021 +// Modified: +// +// Purpose: Integer Divide instructions +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +// *** I added these verilator controls to clean up the +// lint output. 
The linter warnings should be fixed, but now the output is at +// least readable. +/* verilator lint_off COMBDLY */ +/* verilator lint_off IMPLICIT */ + +`include "wally-config.vh" + +module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); + + input logic [63:0] N, D; + input logic clk; + input logic reset; + input logic start; + input logic S; + + output logic [63:0] Qf; + output logic [63:0] remf; + output logic div0; + output logic done; + output logic divBusy; + + logic divdone; + logic enable; + logic state0; + logic V; + logic [7:0] Num; + logic [5:0] P, NumIter, RemShift; + logic [63:0] op1, op2, op1shift, Rem5; + logic [64:0] Qd, Rd, Qd2, Rd2; + logic [63:0] Q, rem0; + logic [3:0] quotient; + logic otfzero; + logic shiftResult; + logic enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp; + + logic [63:0] twoD; + logic [63:0] twoN; + logic SignD; + logic SignN; + logic [63:0] QT, remT; + logic D_NegOne; + logic Max_N; + + // Check if negative (two's complement) + // If so, convert to positive + adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD); + adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN); + assign SignD = D[63]; + assign SignN = N[63]; + // Max N and D = -1 (Overflow) + assign Max_N = (~|N[62:0]) & N[63]; + assign D_NegOne = &D; + + // Divider goes the distance to 37 cycles + // (thanks to the evil divisor for D = 0x1) + + // Shift D, if needed (for integer) + // needed to allow qst to be in range for integer + // division [1,2) and allow integer divide to work. + // + // The V or valid bit can be used to determine if D + // is 0 and thus a divide by 0 exception. This div0 + // exception is given to FSM to tell the operation to + // quit gracefully. 
+ + lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD)); + shift_left #(64) p2 (twoD, P, op2); + assign op1 = twoN; + assign div0 = ~V; + + // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0) + // v = 2 since \rho < 1 (add 4 to make sure its a ceil) + adder #(8) cpa3 ({2'b0, P}, + {5'h0, shiftResult, ~shiftResult, 1'b0}, + Num); + + // Determine whether need to add just Q/Rem + assign shiftResult = P[0]; + // div by 2 (ceil) + assign NumIter = Num[6:1]; + assign RemShift = P; + + // FSM to control integer divider + // assume inputs are postive edge and + // datapath (divider) is negative edge + fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv, + start, div0, NumIter, ~clk, reset); + + flopr #(1) rega (~clk, reset, donev, done); + flopr #(1) regb (~clk, reset, divdonev, divdone); + flopr #(1) regc (~clk, reset, otfzerov, otfzero); + flopr #(1) regd (~clk, reset, enablev, enable); + flopr #(1) rege (~clk, reset, state0v, state0); + flopr #(1) regf (~clk, reset, divBusyv, divBusy); + + // To obtain a correct remainder the last bit of the + // quotient has to be aligned with a radix-r boundary. + // Since the quotient is in the range 1/2 < q < 2 (one + // integer bit and m fractional bits), this is achieved by + // shifting N right by v+s so that (m+v+s) mod k = 0. And, + // the quotient has to be aligned to the integer position. + + divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, + enable, otfzero, shiftResult); + + // Storage registers to hold contents stable + flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2); + flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2); + + // Probably not needed - just assigns results + assign Q = Qd2[63:0]; + assign Rem5 = Rd2[64:1]; + + // Adjust remainder by m + shift_right #(64) p4 (Rem5, RemShift, rem0); + + // Adjust Q/Rem for Signed + assign tcQ = (SignN ^ SignD) & S; + assign tcR = SignN & S; + // Signed Divide + // - When N and D are negative: Remainder is negative (undergoes a two's complement). 
+ // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement). + // - When D is negative: Quotient is negative (undergoes a two's complement). + adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT); + adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT); + + // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec) + exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf); + +endmodule // int32div + +module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, + enable, otfzero, shiftResult); + + input logic [63:0] op1, op2; + input logic clk, state0; + input logic reset; + input logic enable; + input logic otfzero; + input logic shiftResult; + + output logic [64:0] rem0; + output logic [64:0] Q; + output logic [3:0] quotient; + + logic [67:0] Sum, Carry; + logic [64:0] Qstar; + logic [64:0] QMstar; + logic [7:0] qtotal; + logic [67:0] SumN, CarryN, SumN2, CarryN2; + logic [67:0] divi1, divi2, divi1c, divi2c, dive1; + logic [67:0] mdivi_temp, mdivi; + logic zero; + logic [1:0] qsel; + logic [1:0] Qin, QMin; + logic CshiftQ, CshiftQM; + logic [67:0] rem1, rem2, rem3; + logic [67:0] SumR, CarryR; + logic [64:0] Qt; + + // Create one's complement values of Divisor (for q*D) + assign divi1 = {3'h0, op2, 1'b0}; + assign divi2 = {2'h0, op2, 2'b0}; + assign divi1c = ~divi1; + assign divi2c = ~divi2; + // Shift x1 if not mod k + mux2 #(68) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1); + + // I I I . F F F F F ... 
(Robertson Criteria - \rho * qmax * D) + mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN); + mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN); + // Simplify QST + adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal); + // q = {+2, +1, -1, -2} else q = 0 + qst4 pd1 (qtotal[7:1], divi1[63:61], quotient); + assign ulp = quotient[2]|quotient[3]; + assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]); + // Map to binary encoding + assign qsel[1] = quotient[3]|quotient[2]; + assign qsel[0] = quotient[3]|quotient[1]; + mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp); + mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi); + csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry); + // regs : save CSA + flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2); + flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2); + // OTF + ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM); + otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, + otfzero, enable, Qstar, QMstar); + + // Correction and generation of Remainder + adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1); + // Add back +D as correction + csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR); + adder #(68) cpa3 (SumR, CarryR, rem2); + // Choose remainder (Rem or Rem+D) + mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3); + // Choose correct Q or QM + mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt); + // Final results + assign rem0 = rem3[64:0]; + assign Q = Qt; + +endmodule // divide4x64 + +module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM); + + input logic [3:0] quot; + + output logic [1:0] Qin; + output logic [1:0] QMin; + output logic CshiftQ; + output logic CshiftQM; + + // Load/Store Control for OTF + assign Qin[1] = (quot[1]) | (quot[3]) | (quot[0]); + assign Qin[0] = (quot[1]) | (quot[2]); + assign QMin[1] = (quot[1]) | (!quot[3]&!quot[2]&!quot[1]&!quot[0]); + assign QMin[0] = (quot[3]) | (quot[0]) | + (!quot[3]&!quot[2]&!quot[1]&!quot[0]); + assign 
CshiftQ = (quot[1]) | (quot[0]); + assign CshiftQM = (quot[3]) | (quot[2]); + +endmodule + +// On-the-fly Conversion per Ercegovac/Lang + +module otf #(parameter WIDTH=8) + (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q); + + input logic [1:0] Qin, QMin; + input logic CshiftQ, CshiftQM; + input logic clk; + input logic reset; + input logic enable; + + output logic [WIDTH-1:0] R2Q; + output logic [WIDTH-1:0] R1Q; + + logic [WIDTH-1:0] Qstar, QMstar; + logic [WIDTH-1:0] M1Q, M2Q; + + // QM + mux2 #(WIDTH) m1 (QMstar, Qstar, CshiftQM, M1Q); + flopenr #(WIDTH) r1 (clk, reset, enable, {M1Q[WIDTH-3:0], QMin}, R1Q); + // Q + mux2 #(WIDTH) m2 (Qstar, QMstar, CshiftQ, M2Q); + flopenr #(WIDTH) r2 (clk, reset, enable, {M2Q[WIDTH-3:0], Qin}, R2Q); + + assign Qstar = R2Q; + assign QMstar = R1Q; + +endmodule // otf8 + +module adder #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, + output logic [WIDTH-1:0] y); + + assign y = a + b; + +endmodule // adder + +module fa (input logic a, b, c, output logic sum, carry); + + assign sum = a^b^c; + assign carry = a&b|a&c|b&c; + +endmodule // fa + +module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c, + output logic [WIDTH-1:0] sum, carry); + + logic [WIDTH:0] carry_temp; + genvar i; + generate + for (i=0;i B. LT and GT are both '0' if A = B. + +module magcompare2b (LT, GT, A, B); + + input logic [1:0] A; + input logic [1:0] B; + + output logic LT; + output logic GT; + + // Determine if A < B using a minimized sum-of-products expression + assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0]; + // Determine if A > B using a minimized sum-of-products expression + assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0]; + +endmodule // magcompare2b + +// J. E. Stine and M. J. Schulte, "A combined two's complement and +// floating-point comparator," 2005 IEEE International Symposium on +// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531 + +module magcompare8 (LT, EQ, A, B); + + input logic [7:0] A; + input logic [7:0] B; + + logic [3:0] s; + logic [3:0] t; + logic [1:0] u; + logic [1:0] v; + logic GT; + //wire LT; + + output logic EQ; + output logic LT; + + magcompare2b mag1 (s[0], t[0], A[1:0], B[1:0]); + magcompare2b mag2 (s[1], t[1], A[3:2], B[3:2]); + magcompare2b mag3 (s[2], t[2], A[5:4], B[5:4]); + magcompare2b mag4 (s[3], t[3], A[7:6], B[7:6]); + + magcompare2b mag5 (u[0], v[0], t[1:0], s[1:0]); + magcompare2b mag6 (u[1], v[1], t[3:2], s[3:2]); + + magcompare2b mag7 (LT, GT, v[1:0], u[1:0]); + + assign EQ = ~(GT | LT); + +endmodule // magcompare8 + +module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); + + input logic [63:0] Q; + input logic [63:0] rem; + input logic [63:0] op1; + input logic S; + input logic div0; + input logic Max_N; + input logic D_NegOne; + + output logic [63:0] Qf; + output logic [63:0] remf; + + // Needs to be optimized + always_comb + case ({div0, S, Max_N, D_NegOne}) + 4'b0000 : Qf = Q; + 4'b0001 : Qf = Q; + 4'b0010 : Qf = Q; + 4'b0011 : Qf = Q; + 4'b0100 : Qf = Q; + 4'b0101 : Qf = Q; + 4'b0110 : Qf = Q; + 4'b0111 : Qf = {1'b1, 31'h0}; + 4'b1000 : Qf = {64{1'b1}}; + 4'b1001 : Qf = {64{1'b1}}; + 4'b1010 : Qf = {64{1'b1}}; + 4'b1011 : Qf = {64{1'b1}}; + 4'b1100 : Qf = {64{1'b1}}; + 4'b1101 : Qf = {64{1'b1}}; + 4'b1110 : Qf = {64{1'b1}}; + 4'b1111 : Qf = {64{1'b1}}; + default: Qf = Q; + endcase + + always_comb + case ({div0, S, Max_N, D_NegOne}) + 4'b0000 : remf = rem; + 4'b0001 : remf = rem; + 4'b0010 : remf = rem; + 4'b0011 : remf = rem; + 4'b0100 : remf = rem; + 4'b0101 : remf = rem; + 4'b0110 : remf = rem; + 4'b0111 : remf = 64'h0; + 4'b1000 : remf = op1; + 4'b1001 : remf = op1; + 4'b1010 : remf = op1; + 4'b1011 : remf = op1; + 4'b1100 : remf = op1; + 4'b1101 : remf = op1; + 4'b1110 : remf = op1; + 4'b1111 : remf = op1; + default: remf = rem; + endcase + +endmodule // exception_int + +/* verilator lint_on 
COMBDLY */ +/* verilator lint_on IMPLICIT */ + diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv index 4266ae61a..107b002f6 100755 --- a/wally-pipelined/src/muldiv/div.sv +++ b/wally-pipelined/src/muldiv/div.sv @@ -1,5 +1,5 @@ /////////////////////////////////////////// -// mul.sv +// divide4x64.sv // // Written: James.Stine@okstate.edu 1 February 2021 // Modified: @@ -29,54 +29,53 @@ /* verilator lint_off COMBDLY */ /* verilator lint_off IMPLICIT */ -`include "wally-config.vh" +module intdiv #(parameter WIDTH=64) + (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); -module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); - - input logic [63:0] N, D; - input logic clk; - input logic reset; - input logic start; - input logic S; + input logic [WIDTH-1:0] N, D; + input logic clk; + input logic reset; + input logic start; + input logic S; + + output logic [WIDTH-1:0] Qf; + output logic [WIDTH-1:0] remf; + output logic div0; + output logic done; + output logic divBusy; + + logic enable; + logic state0; + logic V; + logic [$clog2(WIDTH):0] Num; + logic [$clog2(WIDTH)-1:0] P, NumIter, RemShift; + logic [WIDTH-1:0] op1, op2, op1shift, Rem5; + logic [WIDTH:0] Qd, Rd, Qd2, Rd2; + logic [WIDTH-1:0] Q, rem0; + logic [3:0] quotient; + logic otfzero; + logic shiftResult; + logic enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp; + + logic [WIDTH-1:0] twoD; + logic [WIDTH-1:0] twoN; + logic SignD; + logic SignN; + logic [WIDTH-1:0] QT, remT; + logic D_NegOne; + logic Max_N; - output logic [63:0] Qf; - output logic [63:0] remf; - output logic div0; - output logic done; - output logic divBusy; - - logic divdone; - logic enable; - logic state0; - logic V; - logic [7:0] Num; - logic [5:0] P, NumIter, RemShift; - logic [63:0] op1, op2, op1shift, Rem5; - logic [64:0] Qd, Rd, Qd2, Rd2; - logic [63:0] Q, rem0; - logic [3:0] quotient; - logic otfzero; - logic shiftResult; - logic enablev, state0v, donev, divdonev, 
oftzerov, divBusyv, ulp; - - logic [63:0] twoD; - logic [63:0] twoN; - logic SignD; - logic SignN; - logic [63:0] QT, remT; - logic D_NegOne; - logic Max_N; // Check if negative (two's complement) // If so, convert to positive - adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD); - adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN); - assign SignD = D[63]; - assign SignN = N[63]; + adder #(WIDTH) cpa1 ((D ^ {WIDTH{D[WIDTH-1]&S}}), {{WIDTH-1{1'b0}}, D[WIDTH-1]&S}, twoD); + adder #(WIDTH) cpa2 ((N ^ {WIDTH{N[WIDTH-1]&S}}), {{WIDTH-1{1'b0}}, N[WIDTH-1]&S}, twoN); + assign SignD = D[WIDTH-1]; + assign SignN = N[WIDTH-1]; // Max N and D = -1 (Overflow) - assign Max_N = (~|N[62:0]) & N[63]; + assign Max_N = (~|N[WIDTH-2:0]) & N[WIDTH-1]; assign D_NegOne = &D; - + // Divider goes the distance to 37 cycles // (thanks to the evil divisor for D = 0x1) @@ -89,31 +88,31 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); // exception is given to FSM to tell the operation to // quit gracefully. 
- lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD)); - shift_left #(64) p2 (twoD, P, op2); - assign op1 = twoN; + lzd_hier #(WIDTH) p1 (.ZP(P), .ZV(V), .B(twoD)); + shift_left #(WIDTH) p2 (twoD, P, op2); + assign op1 = twoN; assign div0 = ~V; - // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0) + // #iter: N = m+v+s = m+2+s (mod k = 0) // v = 2 since \rho < 1 (add 4 to make sure its a ceil) - adder #(8) cpa3 ({2'b0, P}, - {5'h0, shiftResult, ~shiftResult, 1'b0}, - Num); + // k = 2 (r = 2^k) + adder #($clog2(WIDTH)+1) cpa3 ({1'b0, P}, + {{$clog2(WIDTH)+1-3{1'b0}}, shiftResult, ~shiftResult, 1'b0}, + Num); // Determine whether need to add just Q/Rem assign shiftResult = P[0]; // div by 2 (ceil) - assign NumIter = Num[6:1]; + assign NumIter = Num[$clog2(WIDTH):1]; assign RemShift = P; // FSM to control integer divider // assume inputs are postive edge and // datapath (divider) is negative edge - fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv, - start, div0, NumIter, ~clk, reset); + fsm64 #($clog2(WIDTH)) fsm1 (enablev, state0v, donev, otfzerov, divBusyv, + start, div0, NumIter, ~clk, reset); flopr #(1) rega (~clk, reset, donev, done); - flopr #(1) regb (~clk, reset, divdonev, divdone); flopr #(1) regc (~clk, reset, otfzerov, otfzero); flopr #(1) regd (~clk, reset, enablev, enable); flopr #(1) rege (~clk, reset, state0v, state0); @@ -125,64 +124,66 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); // integer bit and m fractional bits), this is achieved by // shifting N right by v+s so that (m+v+s) mod k = 0. And, // the quotient has to be aligned to the integer position. 
- - divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, - enable, otfzero, shiftResult); + divide4 #(WIDTH) p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, + enable, otfzero, shiftResult); // Storage registers to hold contents stable - flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2); - flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2); + flopenr #(WIDTH+1) reg3 (clk, reset, enable, Rd, Rd2); + flopenr #(WIDTH+1) reg4 (clk, reset, enable, Qd, Qd2); // Probably not needed - just assigns results - assign Q = Qd2[63:0]; - assign Rem5 = Rd2[64:1]; + assign Q = Qd2[WIDTH-1:0]; + assign Rem5 = Rd2[WIDTH:1]; - // Adjust remainder by m - shift_right #(64) p4 (Rem5, RemShift, rem0); + // Adjust remainder by m (no need to adjust by + shift_right #(WIDTH) p4 (Rem5, RemShift, rem0); // Adjust Q/Rem for Signed assign tcQ = (SignN ^ SignD) & S; assign tcR = SignN & S; - // Signed Divide + + // When Dividend (N) and/or Divisor (D) are negative (first bit is '1'): // - When N and D are negative: Remainder is negative (undergoes a two's complement). // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement). // - When D is negative: Quotient is negative (undergoes a two's complement). 
- adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT); - adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT); + adder #(WIDTH) cpa4 ((rem0 ^ {WIDTH{tcR}}), {{WIDTH-1{1'b0}}, tcR}, remT); + adder #(WIDTH) cpa5 ((Q ^ {WIDTH{tcQ}}), {{WIDTH-1{1'b0}}, tcQ}, QT); // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec) - exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf); - + exception_int #(WIDTH) exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf); + endmodule // int32div -module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, - enable, otfzero, shiftResult); +// Division by Recurrence (r=4) +module divide4 #(parameter WIDTH=64) + (Q, rem0, quotient, op1, op2, clk, reset, state0, + enable, otfzero, shiftResult); - input logic [63:0] op1, op2; - input logic clk, state0; - input logic reset; - input logic enable; - input logic otfzero; - input logic shiftResult; + input logic [WIDTH-1:0] op1, op2; + input logic clk, state0; + input logic reset; + input logic enable; + input logic otfzero; + input logic shiftResult; - output logic [64:0] rem0; - output logic [64:0] Q; - output logic [3:0] quotient; + output logic [WIDTH:0] rem0; + output logic [WIDTH:0] Q; + output logic [3:0] quotient; - logic [67:0] Sum, Carry; - logic [64:0] Qstar; - logic [64:0] QMstar; - logic [7:0] qtotal; - logic [67:0] SumN, CarryN, SumN2, CarryN2; - logic [67:0] divi1, divi2, divi1c, divi2c, dive1; - logic [67:0] mdivi_temp, mdivi; - logic zero; - logic [1:0] qsel; - logic [1:0] Qin, QMin; - logic CshiftQ, CshiftQM; - logic [67:0] rem1, rem2, rem3; - logic [67:0] SumR, CarryR; - logic [64:0] Qt; + logic [WIDTH+3:0] Sum, Carry; + logic [WIDTH:0] Qstar; + logic [WIDTH:0] QMstar; + logic [7:0] qtotal; + logic [WIDTH+3:0] SumN, CarryN, SumN2, CarryN2; + logic [WIDTH+3:0] divi1, divi2, divi1c, divi2c, dive1; + logic [WIDTH+3:0] mdivi_temp, mdivi; + logic zero; + logic [1:0] qsel; + logic [1:0] Qin, QMin; + logic CshiftQ, CshiftQM; + logic 
[WIDTH+3:0] rem1, rem2, rem3; + logic [WIDTH+3:0] SumR, CarryR; + logic [WIDTH:0] Qt; // Create one's complement values of Divisor (for q*D) assign divi1 = {3'h0, op2, 1'b0}; @@ -190,42 +191,42 @@ module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, assign divi1c = ~divi1; assign divi2c = ~divi2; // Shift x1 if not mod k - mux2 #(68) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1); + mux2 #(WIDTH+4) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1); // I I I . F F F F F ... (Robertson Criteria - \rho * qmax * D) - mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN); - mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN); + mux2 #(WIDTH+4) mx2 ({CarryN2[WIDTH+1:0], 2'h0}, {WIDTH+4{1'b0}}, state0, CarryN); + mux2 #(WIDTH+4) mx3 ({SumN2[WIDTH+1:0], 2'h0}, dive1, state0, SumN); // Simplify QST - adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal); + adder #(8) cpa1 (SumN[WIDTH+3:WIDTH-4], CarryN[WIDTH+3:WIDTH-4], qtotal); // q = {+2, +1, -1, -2} else q = 0 - qst4 pd1 (qtotal[7:1], divi1[63:61], quotient); + qst4 pd1 (qtotal[7:1], divi1[WIDTH-1:WIDTH-3], quotient); assign ulp = quotient[2]|quotient[3]; assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]); // Map to binary encoding assign qsel[1] = quotient[3]|quotient[2]; assign qsel[0] = quotient[3]|quotient[1]; - mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp); - mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi); - csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry); + mux4 #(WIDTH+4) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp); + mux2 #(WIDTH+4) mx5 (mdivi_temp, {WIDTH+4{1'b0}}, zero, mdivi); + csa #(WIDTH+4) csa1 (mdivi, SumN, {CarryN[WIDTH+3:1], ulp}, Sum, Carry); // regs : save CSA - flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2); - flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2); + flopenr #(WIDTH+4) reg1 (clk, reset, enable, Sum, SumN2); + flopenr #(WIDTH+4) reg2 (clk, reset, enable, Carry, CarryN2); // 
OTF ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM); - otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, - otfzero, enable, Qstar, QMstar); + otf #(WIDTH+1) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, + otfzero, enable, Qstar, QMstar); // Correction and generation of Remainder - adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1); + adder #(WIDTH+4) cpa2 (SumN2[WIDTH+3:0], CarryN2[WIDTH+3:0], rem1); // Add back +D as correction - csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR); - adder #(68) cpa3 (SumR, CarryR, rem2); + csa #(WIDTH+4) csa2 (CarryN2[WIDTH+3:0], SumN2[WIDTH+3:0], divi1, SumR, CarryR); + adder #(WIDTH+4) cpa3 (SumR, CarryR, rem2); // Choose remainder (Rem or Rem+D) - mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3); + mux2 #(WIDTH+4) mx6 (rem1, rem2, rem1[WIDTH+3], rem3); // Choose correct Q or QM - mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt); + mux2 #(WIDTH+1) mx7 (Qstar, QMstar, rem1[WIDTH+3], Qt); // Final results - assign rem0 = rem3[64:0]; + assign rem0 = rem3[WIDTH:0]; assign Q = Qt; endmodule // divide4x64 @@ -304,10 +305,9 @@ module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c, fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]); end endgenerate - //assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0}; // trmimmed excess bit dh 5/3/21 - assign carry = {carry_temp[WIDTH-1:1], 1'b0}; + assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0}; -endmodule // adder +endmodule // csa module eqcmp #(parameter WIDTH = 8) (input logic [WIDTH-1:0] a, b, @@ -490,26 +490,24 @@ module lz64 (ZP, ZV, B); endmodule // lz64 // FSM Control for Integer Divider -module fsm64 (en, state0, done, divdone, otfzero, divBusy, - start, error, NumIter, clk, reset); +module fsm64 #(parameter WIDTH=6) + (en, state0, done, otfzero, divBusy, start, error, NumIter, clk, reset); - input logic [5:0] NumIter; - input logic clk; - input logic reset; - input logic start; - input logic error; + input logic [WIDTH-1:0] NumIter; + input logic clk; + input 
logic reset; + input logic start; + input logic error; - output logic done; - output logic en; - output logic state0; - output logic divdone; - output logic otfzero; - output logic divBusy; + output logic done; + output logic en; + output logic state0; + output logic otfzero; + output logic divBusy; - logic LT, EQ; - logic Divide0; - logic [5:0] CURRENT_STATE; - logic [5:0] NEXT_STATE; + logic LT, EQ; + logic [5:0] CURRENT_STATE; + logic [5:0] NEXT_STATE; parameter [5:0] S0=6'd0, S1=6'd1, S2=6'd2, @@ -534,12 +532,8 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, CURRENT_STATE<=NEXT_STATE; end - // Going to cheat and hard code number of states - // needed into FSM instead of using a counter - // FIXME: could counter be better - // Cheated and made 8 - let synthesis do its magic - magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {2'h0, NumIter}); + magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {{8-WIDTH{1'b0}}, NumIter}); always @(CURRENT_STATE or start) begin @@ -552,7 +546,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; divBusy = 1'b0; state0 = 1'b0; - divdone = 1'b0; done = 1'b0; NEXT_STATE <= S0; end @@ -560,30 +553,21 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, begin otfzero = 1'b0; en = 1'b1; - divBusy = 1'b1; + divBusy = 1'b1; state0 = 1'b1; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; done = 1'b0; - divdone = 1'b0; NEXT_STATE <= S1; end end S1: begin - otfzero = 1'b0; - divBusy = 1'b1; + otfzero = 1'b0; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S2; end else @@ -591,8 +575,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S2; + NEXT_STATE <= S36; end end // case: S1 S2: @@ -604,10 +587,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone 
= 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S3; end // if (LT|EQ) else @@ -615,8 +594,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S3; + NEXT_STATE <= S36; end end // case: S2 S3: @@ -628,10 +606,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S4; end else @@ -639,8 +613,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S4; + NEXT_STATE <= S36; end end // case: S3 S4: @@ -652,10 +625,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S5; end else @@ -663,8 +632,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S5; + NEXT_STATE <= S36; end end // case: S4 S5: @@ -676,10 +644,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S6; end // if (LT|EQ) else @@ -687,8 +651,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S6; + NEXT_STATE <= S36; end end // case: S5 S6: @@ -700,10 +663,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S7; end // if (LT|EQ) else @@ -711,8 +670,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S7; + NEXT_STATE <= S36; end end // case: S6 S7: @@ -724,10 +682,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; 
state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S8; end // if (LT|EQ) else @@ -735,8 +689,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S8; + NEXT_STATE <= S36; end end // case: S7 S8: @@ -748,10 +701,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S9; end // if (LT|EQ) else @@ -759,8 +708,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S9; + NEXT_STATE <= S36; end end // case: S8 S9: @@ -772,10 +720,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S10; end // if (LT|EQ) else @@ -783,8 +727,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S10; + NEXT_STATE <= S36; end end // case: S9 S10: @@ -796,10 +739,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S11; end // if (LT|EQ) else @@ -807,8 +746,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S11; + NEXT_STATE <= S36; end end // case: S10 S11: @@ -820,10 +758,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S12; end // if (LT|EQ) else @@ -831,8 +765,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S12; + NEXT_STATE <= S36; end end // case: S11 S12: @@ 
-844,10 +777,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S13; end // if (LT|EQ) else @@ -855,8 +784,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S13; + NEXT_STATE <= S36; end end // case: S12 S13: @@ -868,10 +796,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S14; end // if (LT|EQ) else @@ -879,23 +803,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S14; + NEXT_STATE <= S36; end end // case: S13 S14: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S15; end // if (LT|EQ) else @@ -903,23 +822,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S15; + NEXT_STATE <= S36; end end // case: S14 S15: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S16; end // if (LT|EQ) else @@ -927,23 +841,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S16; + NEXT_STATE <= S36; end end // case: S15 S16: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S17; end // if (LT|EQ) else @@ -951,23 +860,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - 
divdone = 1'b0; - NEXT_STATE <= S17; + NEXT_STATE <= S36; end end // case: S16 S17: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S18; end // if (LT|EQ) else @@ -975,23 +879,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S18; + NEXT_STATE <= S36; end end // case: S17 S18: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S19; end // if (LT|EQ) else @@ -999,23 +898,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S19; + NEXT_STATE <= S36; end end // case: S18 S19: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S20; end // if (LT|EQ) else @@ -1023,23 +917,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S20; + NEXT_STATE <= S36; end end // case: S19 S20: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S21; end // if (LT|EQ) else @@ -1047,23 +936,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S21; + NEXT_STATE <= S36; end end // case: S20 S21: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S22; end // if (LT|EQ) else @@ -1071,23 +955,18 @@ module fsm64 (en, 
state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S22; + NEXT_STATE <= S36; end end // case: S21 S22: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S23; end // if (LT|EQ) else @@ -1095,23 +974,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S23; + NEXT_STATE <= S36; end end // case: S22 S23: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S24; end // if (LT|EQ) else @@ -1119,23 +993,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S24; + NEXT_STATE <= S36; end end // case: S23 S24: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S25; end // if (LT|EQ) else @@ -1143,23 +1012,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S25; + NEXT_STATE <= S36; end end // case: S24 S25: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S26; end // if (LT|EQ) else @@ -1167,23 +1031,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S26; + NEXT_STATE <= S36; end end // case: S25 S26: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; 
NEXT_STATE <= S27; end // if (LT|EQ) else @@ -1191,23 +1050,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S27; + NEXT_STATE <= S36; end end // case: S26 S27: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S28; end // if (LT|EQ) else @@ -1215,23 +1069,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S28; + NEXT_STATE <= S36; end end // case: S27 S28: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S29; end // if (LT|EQ) else @@ -1239,23 +1088,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S29; + NEXT_STATE <= S36; end end // case: S28 S29: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S30; end // if (LT|EQ) else @@ -1263,23 +1107,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S30; + NEXT_STATE <= S36; end end // case: S29 S30: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S31; end // if (LT|EQ) else @@ -1287,8 +1126,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S31; + NEXT_STATE <= S36; end end // case: S30 S31: @@ -1300,10 +1138,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, 
en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S32; end // if (LT|EQ) else @@ -1311,8 +1145,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S32; + NEXT_STATE <= S36; end end // case: S31 S32: @@ -1324,10 +1157,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S33; end // if (LT|EQ) else @@ -1335,8 +1164,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S33; + NEXT_STATE <= S36; end end // case: S32 S33: @@ -1348,10 +1176,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S34; end // if (LT|EQ) else @@ -1359,23 +1183,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S34; + NEXT_STATE <= S36; end end // case: S33 S34: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b1; if (LT|EQ) begin en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S35; end // if (LT|EQ) else @@ -1383,8 +1202,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; - NEXT_STATE <= S35; + NEXT_STATE <= S36; end end // case: S34 S35: @@ -1396,10 +1214,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b1; state0 = 1'b0; done = 1'b0; - if (EQ) - divdone = 1'b1; - else - divdone = 1'b0; NEXT_STATE <= S36; end // if (LT|EQ) else @@ -1407,7 +1221,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; NEXT_STATE <= S36; end end // case: S35 @@ 
-1419,12 +1232,10 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, done = 1'b1; if (EQ) begin - divdone = 1'b1; en = 1'b1; end else begin - divdone = 1'b0; en = 1'b0; end NEXT_STATE <= S0; @@ -1432,11 +1243,10 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy, default: begin otfzero = 1'b0; - divBusy = 1'b1; + divBusy = 1'b0; en = 1'b0; state0 = 1'b0; done = 1'b0; - divdone = 1'b0; NEXT_STATE <= S0; end endcase // case(CURRENT_STATE) @@ -1497,38 +1307,39 @@ module magcompare8 (LT, EQ, A, B); endmodule // magcompare8 -module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); +// RISC-V Exception Logic for Divide by 0 and Overflow (Signed Integer Divide) +module exception_int #(parameter WIDTH=8) + (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); - input logic [63:0] Q; - input logic [63:0] rem; - input logic [63:0] op1; - input logic S; - input logic div0; - input logic Max_N; - input logic D_NegOne; + input logic [WIDTH-1:0] Q; + input logic [WIDTH-1:0] rem; + input logic [WIDTH-1:0] op1; + input logic S; + input logic div0; + input logic Max_N; + input logic D_NegOne; - output logic [63:0] Qf; - output logic [63:0] remf; + output logic [WIDTH-1:0] Qf; + output logic [WIDTH-1:0] remf; - // Needs to be optimized always_comb case ({div0, S, Max_N, D_NegOne}) 4'b0000 : Qf = Q; 4'b0001 : Qf = Q; - 4'b0010 : Qf = Q; - 4'b0011 : Qf = Q; + 4'b0010 : Qf = Q; + 4'b0011 : Qf = Q; 4'b0100 : Qf = Q; - 4'b0101 : Qf = Q; + 4'b0101 : Qf = Q; 4'b0110 : Qf = Q; - 4'b0111 : Qf = {1'b1, 31'h0}; - 4'b1000 : Qf = {64{1'b1}}; - 4'b1001 : Qf = {64{1'b1}}; - 4'b1010 : Qf = {64{1'b1}}; - 4'b1011 : Qf = {64{1'b1}}; - 4'b1100 : Qf = {64{1'b1}}; - 4'b1101 : Qf = {64{1'b1}}; - 4'b1110 : Qf = {64{1'b1}}; - 4'b1111 : Qf = {64{1'b1}}; + 4'b0111 : Qf = {1'b1, {WIDTH-1{1'h0}}}; + 4'b1000 : Qf = {WIDTH{1'b1}}; + 4'b1001 : Qf = {WIDTH{1'b1}}; + 4'b1010 : Qf = {WIDTH{1'b1}}; + 4'b1011 : Qf = {WIDTH{1'b1}}; + 4'b1100 : Qf = {WIDTH{1'b1}}; + 4'b1101 : Qf = 
{WIDTH{1'b1}}; + 4'b1110 : Qf = {WIDTH{1'b1}}; + 4'b1111 : Qf = {WIDTH{1'b1}}; default: Qf = Q; endcase @@ -1536,18 +1347,18 @@ module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); case ({div0, S, Max_N, D_NegOne}) 4'b0000 : remf = rem; 4'b0001 : remf = rem; - 4'b0010 : remf = rem; + 4'b0010 : remf = rem; 4'b0011 : remf = rem; 4'b0100 : remf = rem; 4'b0101 : remf = rem; 4'b0110 : remf = rem; - 4'b0111 : remf = 64'h0; + 4'b0111 : remf = {WIDTH{1'h0}}; 4'b1000 : remf = op1; 4'b1001 : remf = op1; 4'b1010 : remf = op1; 4'b1011 : remf = op1; 4'b1100 : remf = op1; - 4'b1101 : remf = op1; + 4'b1101 : remf = op1; 4'b1110 : remf = op1; 4'b1111 : remf = op1; default: remf = rem; @@ -1557,4 +1368,3 @@ endmodule // exception_int /* verilator lint_on COMBDLY */ /* verilator lint_on IMPLICIT */ - diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv index 17c4aac54..f4096fd1b 100644 --- a/wally-pipelined/src/muldiv/muldiv.sv +++ b/wally-pipelined/src/muldiv/muldiv.sv @@ -78,7 +78,7 @@ module muldiv ( .en(startDivideE), .clear(DivDoneE), .reset(reset), .clk(~gclk)); assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]); - div div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide); + intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide); // Added for debugging of start signal for divide assign startDivideE = MulDivE&DivStartE&~DivBusyE; @@ -93,7 +93,6 @@ module muldiv ( // Select result always_comb - // case (DivDoneE ? Funct3E_Q : Funct3E) case (Funct3E) 3'b000: PrelimResultE = ProdE[`XLEN-1:0]; 3'b001: PrelimResultE = ProdE[`XLEN*2-1:`XLEN]; From a71b97e8784a975a939ef5efa71c68b83f4c42c7 Mon Sep 17 00:00:00 2001 From: "James E. 
Stine" Date: Mon, 31 May 2021 09:16:30 -0400 Subject: [PATCH 07/19] Cosmetic changes on integer divider --- wally-pipelined/src/muldiv/div.sv | 7 ++++--- wally-pipelined/src/muldiv/muldiv.sv | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv index 107b002f6..8b4e0463a 100755 --- a/wally-pipelined/src/muldiv/div.sv +++ b/wally-pipelined/src/muldiv/div.sv @@ -55,7 +55,7 @@ module intdiv #(parameter WIDTH=64) logic [3:0] quotient; logic otfzero; logic shiftResult; - logic enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp; + logic enablev, state0v, donev, oftzerov, divBusyv, ulp; logic [WIDTH-1:0] twoD; logic [WIDTH-1:0] twoN; @@ -231,6 +231,7 @@ module divide4 #(parameter WIDTH=64) endmodule // divide4x64 +// Load/Control for OTFC module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM); input logic [3:0] quot; @@ -251,8 +252,7 @@ module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM); endmodule -// On-the-fly Conversion per Ercegovac/Lang - +// On-the-fly Conversion (OTFC) module otf #(parameter WIDTH=8) (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q); @@ -317,6 +317,7 @@ module eqcmp #(parameter WIDTH = 8) endmodule // eqcmp +// QST for r=4 module qst4 (input logic [6:0] s, input logic [2:0] d, output logic [3:0] q); diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv index f4096fd1b..ccabe341a 100644 --- a/wally-pipelined/src/muldiv/muldiv.sv +++ b/wally-pipelined/src/muldiv/muldiv.sv @@ -47,7 +47,6 @@ module muldiv ( logic [`XLEN-1:0] MulDivResultE, MulDivResultM; logic [`XLEN-1:0] PrelimResultE; logic [`XLEN-1:0] QuotE, RemE; - //logic [`XLEN-1:0] Q, R; logic [`XLEN*2-1:0] ProdE; logic enable_q; From 1db8d0e59cf0e0be114b5eced17ee79295c1d4b6 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Mon, 31 May 2021 16:11:12 -0500 Subject: [PATCH 08/19] may have fixed the global branch history predictor. 
The solution required a completed rewrite and understanding of how the GHR needs to be speculatively updated and repaired. --- testsBP/simple/header.h | 4 +- testsBP/simple/main.c | 4 +- wally-pipelined/src/ifu/bpred.sv | 11 ++- .../src/ifu/globalHistoryPredictor.sv | 98 +++++++++++++++---- wally-pipelined/src/ifu/gshare.sv | 41 +++++++- 5 files changed, 128 insertions(+), 30 deletions(-) diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h index 6def656f8..f3a62da30 100644 --- a/testsBP/simple/header.h +++ b/testsBP/simple/header.h @@ -5,5 +5,7 @@ int fail(); int simple_csrbr_test(); int lbu_test(); int icache_spill_test(); -void global_hist_test(); +void global_hist_1_space_test(); +void global_hist_2_space_test(); +void global_hist_3_space_test(); #endif diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c index 036a351d1..7bf6b4751 100644 --- a/testsBP/simple/main.c +++ b/testsBP/simple/main.c @@ -2,7 +2,9 @@ int main(){ //int res = icache_spill_test(); - global_hist_test(); + global_hist_3_space_test(); + global_hist_2_space_test(); + global_hist_1_space_test(); int res = 1; if (res < 0) { fail(); diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv index c5b4dde48..9beaa959a 100644 --- a/wally-pipelined/src/ifu/bpred.sv +++ b/wally-pipelined/src/ifu/bpred.sv @@ -90,12 +90,13 @@ module bpred .reset(reset), .*, // Stalls and flushes .LookUpPC(PCNextF), - .Prediction(BPPredF), + .BPPredF(BPPredF), // update - .UpdatePC(PCE), - .UpdateEN(InstrClassE[0] & ~StallE), - .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF), + .BPPredD(BPPredD), + .InstrClassE(InstrClassE), + .BPInstrClassE(BPInstrClassE), .BPPredDirWrongE(BPPredDirWrongE), + .UpdatePC(PCE), .PCSrcE(PCSrcE), .UpdatePrediction(UpdateBPPredE)); end else if (`BPTYPE == "BPGSHARE") begin:Predictor @@ -108,6 +109,8 @@ module bpred // update .UpdatePC(PCE), .UpdateEN(InstrClassE[0] & ~StallE), + .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF), + 
.BPPredDirWrongE(BPPredDirWrongE), .PCSrcE(PCSrcE), .UpdatePrediction(UpdateBPPredE)); end diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv index fadbf004b..b2ac19911 100644 --- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv +++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv @@ -34,49 +34,108 @@ module globalHistoryPredictor input logic reset, input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, input logic [`XLEN-1:0] LookUpPC, - output logic [1:0] Prediction, + output logic [1:0] BPPredF, // update + input logic [1:0] BPPredD, + input logic [4:0] InstrClassE, + input logic [4:0] BPInstrClassE, + input logic [4:0] BPInstrClassD, + input logic [4:0] BPInstrClassF, + input logic BPPredDirWrongE, + input logic [`XLEN-1:0] UpdatePC, - input logic UpdateEN, PCSrcE, - input logic SpeculativeUpdateEn, BPPredDirWrongE, + input logic PCSrcE, input logic [1:0] UpdatePrediction ); - logic [k-1:0] GHRF, GHRFNext, GHRD, GHRE, GHRLookup; + logic [k+1:0] GHR, GHRNext; + logic [k-1:0] PHTUpdateAdr, PHTUpdateAdr0, PHTUpdateAdr1; + logic PHTUpdateEN; + logic BPClassWrongNonCFI; + logic BPClassWrongCFI; + logic BPClassRightNonCFI; + + +/* -----\/----- EXCLUDED -----\/----- + logic [k-1:0] GHRD, GHRE, GHRLookup; logic FlushedD, FlushedE; + -----/\----- EXCLUDED -----/\----- */ + + + logic [6:0] GHRMuxSel; + logic GHRUpdateEN; + + assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0]; + assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0]; + assign BPClassWrongNonCFI = BPInstrClassE[0] & ~InstrClassE[0]; + assign BPClassRightBPWrong = BPInstrClassE[0] & InstrClassE[0] & BPPredDirWrongE; + assign BPClassRightBPRight = BPInstrClassE[0] & InstrClassE[0] & ~BPPredDirWrongE; + + // GHR update selection, 1 hot encoded. + assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight); - // if the prediction is wrong we need to restore the ghr. 
- assign GHRFNext = BPPredDirWrongE ? {PCSrcE, GHRE[k-1:1]} : - {Prediction[1], GHRF[k-1:1]}; + assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0]; + assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]); - flopenr #(k) GlobalHistoryRegister(.clk(clk), - .reset(reset), - .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)), - .d(GHRFNext), - .q(GHRF)); + + assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0]; + + + + assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0]; + assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0]; + assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight)); + assign GHRUpdateEN = (| GHRMuxSel[5:1] & ~StallE) | GHRMuxSel[6] & ~StallF; + + // hoping this created a AND-OR mux. + always_comb begin + case (GHRMuxSel) + 7'b000_0001: GHRNext = GHR[k-1+2:0]; // no change + 7'b000_0010: GHRNext = {GHR[k-2+2:0], PCSrcE}; // branch update + 7'b000_0100: GHRNext = {1'b0, GHR[k+1:1]}; // repair 1 + 7'b000_1000: GHRNext = {GHR[k-1+2:1], PCSrcE}; // branch update with mis prediction correction + 7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2 + 7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1 + 7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update + //7'b100_0000: GHRNext = {k+1{1'bx}}; // speculative update + default: GHRNext = GHR[k-1+2:0]; + endcase + end + + flopenr #(k+2) GlobalHistoryRegister(.clk(clk), + .reset(reset), + .en((GHRUpdateEN)), + .d(GHRNext), + .q(GHR)); // if actively updating the GHR at the time of prediction we want to us - // GHRFNext as the lookup rather than GHRF. + // GHRNext as the lookup rather than GHR. - assign GHRLookup = UpdateEN ? GHRFNext : GHRF; + //assign GHRLookup = GHRUpdateEN ? GHRNext : GHR; + assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0]; + assign PHTUpdateAdr1 = InstrClassE[0] ? 
GHR[k+1:2] : GHR[k:1]; + assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0; + assign PHTUpdateEN = InstrClassE[0] & ~StallE; + // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT SRAM2P1R1W #(k, 2) PHT(.clk(clk), .reset(reset), - .RA1(GHRF), - .RD1(Prediction), + .RA1(GHR[k-1:0]), + .RD1(BPPredF), .REN1(~StallF), - .WA1(GHRE), + .WA1(PHTUpdateAdr), .WD1(UpdatePrediction), - .WEN1(UpdateEN), + .WEN1(PHTUpdateEN), .BitWEN1(2'b11)); +/* -----\/----- EXCLUDED -----\/----- flopenr #(k) GlobalHistoryRegisterD(.clk(clk), .reset(reset), .en(~StallD & ~FlushedE), - .d(GHRF), + .d(GHR), .q(GHRD)); flopenr #(k) GlobalHistoryRegisterE(.clk(clk), @@ -97,6 +156,7 @@ module globalHistoryPredictor .en(~StallE), .d(FlushE | FlushedD), .q(FlushedE)); + -----/\----- EXCLUDED -----/\----- */ endmodule diff --git a/wally-pipelined/src/ifu/gshare.sv b/wally-pipelined/src/ifu/gshare.sv index 4d31e519b..3cc73be80 100644 --- a/wally-pipelined/src/ifu/gshare.sv +++ b/wally-pipelined/src/ifu/gshare.sv @@ -38,28 +38,32 @@ module gsharePredictor // update input logic [`XLEN-1:0] UpdatePC, input logic UpdateEN, PCSrcE, + input logic SpeculativeUpdateEn, BPPredDirWrongE, input logic [1:0] UpdatePrediction ); - logic [k-1:0] GHRF, GHRFNext; + logic [k-1:0] GHRF, GHRFNext, GHRD, GHRE; //logic [k-1:0] LookUpPCIndexD, LookUpPCIndexE; logic [k-1:0] LookUpPCIndex, UpdatePCIndex; logic [1:0] PredictionMemory; logic DoForwarding, DoForwardingF; logic [1:0] UpdatePredictionF; + logic FlushedD, FlushedE; - assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; + // if the prediction is wrong we need to restore the ghr. + assign GHRFNext = BPPredDirWrongE ? 
{PCSrcE, GHRE[k-1:1]} : + {Prediction[1], GHRF[k-1:1]}; flopenr #(k) GlobalHistoryRegister(.clk(clk), .reset(reset), - .en(UpdateEN), + .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)), .d(GHRFNext), .q(GHRF)); // for gshare xor the PC with the GHR - assign UpdatePCIndex = GHRFNext ^ UpdatePC[k:1]; + assign UpdatePCIndex = GHRE ^ UpdatePC[k:1]; assign LookUpPCIndex = GHRF ^ LookUpPC[k:1]; // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT // GHR referes to the address that the past k branches points to in the prediction stage @@ -67,7 +71,7 @@ module gsharePredictor SRAM2P1R1W #(k, 2) PHT(.clk(clk), .reset(reset), .RA1(LookUpPCIndex), - .RD1(PredictionMemory), + .RD1(Prediction), .REN1(~StallF), .WA1(UpdatePCIndex), .WD1(UpdatePrediction), @@ -75,6 +79,32 @@ module gsharePredictor .BitWEN1(2'b11)); + flopenr #(k) GlobalHistoryRegisterD(.clk(clk), + .reset(reset), + .en(~StallD & ~FlushedE), + .d(GHRF), + .q(GHRD)); + + flopenr #(k) GlobalHistoryRegisterE(.clk(clk), + .reset(reset), + .en(~StallE & ~ FlushedE), + .d(GHRD), + .q(GHRE)); + + + flopenr #(1) flushedDReg(.clk(clk), + .reset(reset), + .en(~StallD), + .d(FlushD), + .q(FlushedD)); + + flopenr #(1) flushedEReg(.clk(clk), + .reset(reset), + .en(~StallE), + .d(FlushE | FlushedD), + .q(FlushedE)); + +/* -----\/----- EXCLUDED -----\/----- // need to forward when updating to the same address as reading. // first we compare to see if the update and lookup addreses are the same assign DoForwarding = LookUpPCIndex == UpdatePCIndex; @@ -92,6 +122,7 @@ module gsharePredictor .q(UpdatePredictionF)); assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory; + -----/\----- EXCLUDED -----/\----- */ //pipeline for GHR /* -----\/----- EXCLUDED -----\/----- From 927aec34a24fbb0fd7b1e43eb259f82b14b3775f Mon Sep 17 00:00:00 2001 From: "James E. 
Stine" Date: Mon, 31 May 2021 23:27:42 -0400 Subject: [PATCH 09/19] Modify muldiv.sv to handle W instructions for 64-bits --- wally-pipelined/src/muldiv/div.sv | 1 - wally-pipelined/src/muldiv/muldiv.sv | 17 ++++++++++++++--- wally-pipelined/testbench/testbench-imperas.sv | 10 +++++----- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv index 8b4e0463a..10af5eee4 100755 --- a/wally-pipelined/src/muldiv/div.sv +++ b/wally-pipelined/src/muldiv/div.sv @@ -87,7 +87,6 @@ module intdiv #(parameter WIDTH=64) // is 0 and thus a divide by 0 exception. This div0 // exception is given to FSM to tell the operation to // quit gracefully. - lzd_hier #(WIDTH) p1 (.ZP(P), .ZV(V), .B(twoD)); shift_left #(WIDTH) p2 (twoD, P, op2); assign op1 = twoN; diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv index ccabe341a..0c26a5df8 100644 --- a/wally-pipelined/src/muldiv/muldiv.sv +++ b/wally-pipelined/src/muldiv/muldiv.sv @@ -53,6 +53,7 @@ module muldiv ( logic [2:0] Funct3E_Q; logic div0error; logic [`XLEN-1:0] N, D; + logic [`XLEN-1:0] Num0, Den0; logic gclk; logic DivStartE; @@ -69,13 +70,23 @@ module muldiv ( end assign gclk = enable_q & clk; + // Handle sign extension for W-type instructions + if (`XLEN == 64) begin // RV64 has W-type instructions + assign Num0 = W64E ? {{32{SrcAE[31]&signedDivide}}, SrcAE[31:0]} : SrcAE; + assign Den0 = W64E ? 
{{32{SrcBE[31]&signedDivide}}, SrcBE[31:0]} : SrcBE; + end else begin // RV32 has no W-type instructions + assign Num0 = SrcAE; + assign Den0 = SrcBE; + end + // capture the Numerator/Denominator - flopenrc #(`XLEN) reg_num (.d(SrcAE), .q(N), + flopenrc #(`XLEN) reg_num (.d(Num0), .q(N), .en(startDivideE), .clear(DivDoneE), .reset(reset), .clk(~gclk)); - flopenrc #(`XLEN) reg_den (.d(SrcBE), .q(D), + flopenrc #(`XLEN) reg_den (.d(Den0), .q(D), .en(startDivideE), .clear(DivDoneE), - .reset(reset), .clk(~gclk)); + .reset(reset), .clk(~gclk)); + assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]); intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide); diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index ea6939004..6d8f1049f 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -166,12 +166,12 @@ string tests32f[] = '{ "rv64m/I-MULW-01", "3000", "rv64m/I-DIV-01", "3000", "rv64m/I-DIVU-01", "3000", - //"rv64m/I-DIVUW-01", "3000", - //"rv64m/I-DIVW-01", "3000", + "rv64m/I-DIVUW-01", "3000", + "rv64m/I-DIVW-01", "3000", "rv64m/I-REM-01", "3000", - "rv64m/I-REMU-01", "3000" - //"rv64m/I-REMUW-01", "3000", - //"rv64m/I-REMW-01", "3000" + "rv64m/I-REMU-01", "3000", + "rv64m/I-REMUW-01", "3000", + "rv64m/I-REMW-01", "3000" }; string tests64ic[] = '{ From 5bc2a8b3463f66ac24b547e18f2b75e17dad6827 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 10:57:43 -0500 Subject: [PATCH 10/19] Now have global history working correctly.
--- testsBP/crt0/Makefile | 8 +- testsBP/simple/header.h | 1 + testsBP/simple/main.c | 3 +- wally-pipelined/config/rv64BP/wally-config.vh | 6 +- wally-pipelined/src/ifu/bpred.sv | 29 ++-- .../src/ifu/globalHistoryPredictor.sv | 62 ++----- wally-pipelined/src/ifu/gshare.sv | 159 ------------------ .../testbench/testbench-imperas.sv | 5 +- 8 files changed, 38 insertions(+), 235 deletions(-) delete mode 100644 wally-pipelined/src/ifu/gshare.sv diff --git a/testsBP/crt0/Makefile b/testsBP/crt0/Makefile index b42e86cb8..2af43a408 100644 --- a/testsBP/crt0/Makefile +++ b/testsBP/crt0/Makefile @@ -4,12 +4,12 @@ ROOT := .. LIBRARY_DIRS := LIBRARY_FILES := -MARCH :=-march=rv64ic -MABI :=-mabi=lp64 +MARCH :=-march=rv64imfdc +MABI :=-mabi=lp64d LINK_FLAGS :=$(MARCH) $(MABI) -nostartfiles -AFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -W -CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -mcmodel=medany -O2 +AFLAGS =$(MARCH) $(MABI) -W +CFLAGS =$(MARCH) $(MABI) -mcmodel=medany -O2 AS=riscv64-unknown-elf-as CC=riscv64-unknown-elf-gcc AR=riscv64-unknown-elf-ar diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h index f3a62da30..aab8973fd 100644 --- a/testsBP/simple/header.h +++ b/testsBP/simple/header.h @@ -5,6 +5,7 @@ int fail(); int simple_csrbr_test(); int lbu_test(); int icache_spill_test(); +void global_hist_0_space_test(); void global_hist_1_space_test(); void global_hist_2_space_test(); void global_hist_3_space_test(); diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c index 7bf6b4751..564b474e1 100644 --- a/testsBP/simple/main.c +++ b/testsBP/simple/main.c @@ -4,7 +4,8 @@ int main(){ //int res = icache_spill_test(); global_hist_3_space_test(); global_hist_2_space_test(); - global_hist_1_space_test(); + global_hist_1_space_test(); + global_hist_0_space_test(); int res = 1; if (res < 0) { fail(); diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh index fd482bfde..a9dbb1bda 100644 --- 
a/wally-pipelined/config/rv64BP/wally-config.vh +++ b/wally-pipelined/config/rv64BP/wally-config.vh @@ -32,7 +32,7 @@ `define XLEN 64 //`define MISA (32'h00000105) -`define MISA (32'h00000104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12 | 1 << 0) +`define MISA (32'h00000104 | 1 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0) `define A_SUPPORTED ((`MISA >> 0) % 2 == 1) `define C_SUPPORTED ((`MISA >> 2) % 2 == 1) `define D_SUPPORTED ((`MISA >> 3) % 2 == 1) @@ -107,8 +107,8 @@ /* verilator lint_off ASSIGNDLY */ /* verilator lint_off PINCONNECTEMPTY */ -`define TWO_BIT_PRELOAD "../config/rv64icfd/twoBitPredictor.txt" -`define BTB_PRELOAD "../config/rv64icfd/BTBPredictor.txt" +`define TWO_BIT_PRELOAD "../config/rv64BP/twoBitPredictor.txt" +`define BTB_PRELOAD "../config/rv64BP/BTBPredictor.txt" `define BPRED_ENABLED 1 //`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE `define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE" or BPLOCALPAg or BPGSHARE diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv index 9beaa959a..92471c574 100644 --- a/wally-pipelined/src/ifu/bpred.sv +++ b/wally-pipelined/src/ifu/bpred.sv @@ -89,30 +89,29 @@ module bpred globalHistoryPredictor DirPredictor(.clk(clk), .reset(reset), .*, // Stalls and flushes - .LookUpPC(PCNextF), + .PCNextF(PCNextF), .BPPredF(BPPredF), // update - .BPPredD(BPPredD), .InstrClassE(InstrClassE), .BPInstrClassE(BPInstrClassE), .BPPredDirWrongE(BPPredDirWrongE), - .UpdatePC(PCE), + .PCE(PCE), .PCSrcE(PCSrcE), - .UpdatePrediction(UpdateBPPredE)); + .UpdateBPPredE(UpdateBPPredE)); end else if (`BPTYPE == "BPGSHARE") begin:Predictor gsharePredictor DirPredictor(.clk(clk), - .reset(reset), - .*, // Stalls and flushes - .LookUpPC(PCNextF), - .Prediction(BPPredF), - // update - .UpdatePC(PCE), - .UpdateEN(InstrClassE[0] & ~StallE), - .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF), - .BPPredDirWrongE(BPPredDirWrongE), - .PCSrcE(PCSrcE), - .UpdatePrediction(UpdateBPPredE)); + .reset(reset), + 
.*, // Stalls and flushes + .PCNextF(PCNextF), + .BPPredF(BPPredF), + // update + .InstrClassE(InstrClassE), + .BPInstrClassE(BPInstrClassE), + .BPPredDirWrongE(BPPredDirWrongE), + .PCE(PCE), + .PCSrcE(PCSrcE), + .UpdateBPPredE(UpdateBPPredE)); end else if (`BPTYPE == "BPLOCALPAg") begin:Predictor diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv index b2ac19911..516de633e 100644 --- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv +++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv @@ -33,19 +33,18 @@ module globalHistoryPredictor (input logic clk, input logic reset, input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, - input logic [`XLEN-1:0] LookUpPC, + input logic [`XLEN-1:0] PCNextF, output logic [1:0] BPPredF, // update - input logic [1:0] BPPredD, input logic [4:0] InstrClassE, input logic [4:0] BPInstrClassE, input logic [4:0] BPInstrClassD, input logic [4:0] BPInstrClassF, input logic BPPredDirWrongE, - input logic [`XLEN-1:0] UpdatePC, + input logic [`XLEN-1:0] PCE, input logic PCSrcE, - input logic [1:0] UpdatePrediction + input logic [1:0] UpdateBPPredE ); logic [k+1:0] GHR, GHRNext; @@ -54,17 +53,10 @@ module globalHistoryPredictor logic BPClassWrongNonCFI; logic BPClassWrongCFI; logic BPClassRightNonCFI; - - -/* -----\/----- EXCLUDED -----\/----- - logic [k-1:0] GHRD, GHRE, GHRLookup; - - logic FlushedD, FlushedE; - -----/\----- EXCLUDED -----/\----- */ - logic [6:0] GHRMuxSel; logic GHRUpdateEN; + logic [k-1:0] GHRLookup; assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0]; assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0]; @@ -75,15 +67,9 @@ module globalHistoryPredictor // GHR update selection, 1 hot encoded. 
assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight); - assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0]; - assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]); - - assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0]; - - - + assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]); assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0]; assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0]; assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight)); @@ -99,7 +85,6 @@ module globalHistoryPredictor 7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2 7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1 7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update - //7'b100_0000: GHRNext = {k+1{1'bx}}; // speculative update default: GHRNext = GHR[k-1+2:0]; endcase end @@ -113,50 +98,23 @@ module globalHistoryPredictor // if actively updating the GHR at the time of prediction we want to us // GHRNext as the lookup rather than GHR. - //assign GHRLookup = GHRUpdateEN ? GHRNext : GHR; - assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0]; assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1]; assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0; assign PHTUpdateEN = InstrClassE[0] & ~StallE; + + assign GHRLookup = |GHRMuxSel[6:1] ? 
GHRNext[k-1:0] : GHR[k-1:0]; // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT SRAM2P1R1W #(k, 2) PHT(.clk(clk), .reset(reset), - .RA1(GHR[k-1:0]), + //.RA1(GHR[k-1:0]), + .RA1(GHRLookup), .RD1(BPPredF), .REN1(~StallF), .WA1(PHTUpdateAdr), - .WD1(UpdatePrediction), + .WD1(UpdateBPPredE), .WEN1(PHTUpdateEN), .BitWEN1(2'b11)); -/* -----\/----- EXCLUDED -----\/----- - flopenr #(k) GlobalHistoryRegisterD(.clk(clk), - .reset(reset), - .en(~StallD & ~FlushedE), - .d(GHR), - .q(GHRD)); - - flopenr #(k) GlobalHistoryRegisterE(.clk(clk), - .reset(reset), - .en(~StallE & ~ FlushedE), - .d(GHRD), - .q(GHRE)); - - - flopenr #(1) flushedDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .d(FlushD), - .q(FlushedD)); - - flopenr #(1) flushedEReg(.clk(clk), - .reset(reset), - .en(~StallE), - .d(FlushE | FlushedD), - .q(FlushedE)); - -----/\----- EXCLUDED -----/\----- */ - - endmodule diff --git a/wally-pipelined/src/ifu/gshare.sv b/wally-pipelined/src/ifu/gshare.sv deleted file mode 100644 index 3cc73be80..000000000 --- a/wally-pipelined/src/ifu/gshare.sv +++ /dev/null @@ -1,159 +0,0 @@ -/////////////////////////////////////////// -// gshare.sv -// -// Written: Shreya Sanghai -// Email: ssanghai@hmc.edu -// Created: March 16, 2021 -// Modified: -// -// Purpose: Gshare predictor with parameterized global history register -// -// A component of the Wally configurable RISC-V project. 
-// -// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, -// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software -// is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT -// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -/////////////////////////////////////////// - -`include "wally-config.vh" - -module gsharePredictor - #(parameter int k = 10 - ) - (input logic clk, - input logic reset, - input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, - input logic [`XLEN-1:0] LookUpPC, - output logic [1:0] Prediction, - // update - input logic [`XLEN-1:0] UpdatePC, - input logic UpdateEN, PCSrcE, - input logic SpeculativeUpdateEn, BPPredDirWrongE, - input logic [1:0] UpdatePrediction - - ); - - logic [k-1:0] GHRF, GHRFNext, GHRD, GHRE; - //logic [k-1:0] LookUpPCIndexD, LookUpPCIndexE; - logic [k-1:0] LookUpPCIndex, UpdatePCIndex; - logic [1:0] PredictionMemory; - logic DoForwarding, DoForwardingF; - logic [1:0] UpdatePredictionF; - logic FlushedD, FlushedE; - - // if the prediction is wrong we need to restore the ghr. - assign GHRFNext = BPPredDirWrongE ? 
{PCSrcE, GHRE[k-1:1]} : - {Prediction[1], GHRF[k-1:1]}; - - flopenr #(k) GlobalHistoryRegister(.clk(clk), - .reset(reset), - .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)), - .d(GHRFNext), - .q(GHRF)); - - - // for gshare xor the PC with the GHR - assign UpdatePCIndex = GHRE ^ UpdatePC[k:1]; - assign LookUpPCIndex = GHRF ^ LookUpPC[k:1]; - // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT - // GHR referes to the address that the past k branches points to in the prediction stage - // GHRE refers to the address that the past k branches points to in the exectution stage - SRAM2P1R1W #(k, 2) PHT(.clk(clk), - .reset(reset), - .RA1(LookUpPCIndex), - .RD1(Prediction), - .REN1(~StallF), - .WA1(UpdatePCIndex), - .WD1(UpdatePrediction), - .WEN1(UpdateEN), - .BitWEN1(2'b11)); - - - flopenr #(k) GlobalHistoryRegisterD(.clk(clk), - .reset(reset), - .en(~StallD & ~FlushedE), - .d(GHRF), - .q(GHRD)); - - flopenr #(k) GlobalHistoryRegisterE(.clk(clk), - .reset(reset), - .en(~StallE & ~ FlushedE), - .d(GHRD), - .q(GHRE)); - - - flopenr #(1) flushedDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .d(FlushD), - .q(FlushedD)); - - flopenr #(1) flushedEReg(.clk(clk), - .reset(reset), - .en(~StallE), - .d(FlushE | FlushedD), - .q(FlushedE)); - -/* -----\/----- EXCLUDED -----\/----- - // need to forward when updating to the same address as reading. - // first we compare to see if the update and lookup addreses are the same - assign DoForwarding = LookUpPCIndex == UpdatePCIndex; - - // register the update value and the forwarding signal into the Fetch stage - // TODO: add stall logic *** - flopr #(1) DoForwardingReg(.clk(clk), - .reset(reset), - .d(DoForwarding), - .q(DoForwardingF)); - - flopr #(2) UpdatePredictionReg(.clk(clk), - .reset(reset), - .d(UpdatePrediction), - .q(UpdatePredictionF)); - - assign Prediction = DoForwardingF ? 
UpdatePredictionF : PredictionMemory; - -----/\----- EXCLUDED -----/\----- */ - - //pipeline for GHR -/* -----\/----- EXCLUDED -----\/----- - flopenrc #(k) LookUpDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .clear(FlushD), - .d(LookUpPCIndex), - .q(LookUpPCIndexD)); - - flopenrc #(k) LookUpEReg(.clk(clk), - .reset(reset), - .en(~StallE), - .clear(FlushE), - .d(LookUpPCIndexD), - .q(LookUpPCIndexE)); - -----/\----- EXCLUDED -----/\----- */ - -/* flopenrc #(k) GHRRegD(.clk(clk), - .reset(reset), - .en(~StallD), - .clear(FlushD), - .d(GHRF), - .q(GHRD)); - - flopenrc #(k) GHRRegE(.clk(clk), - .reset(reset), - .en(~StallE), - .clear(FlushE), - .d(GHRD), - .q(GHRE)); - -*/ -endmodule diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index ddee23a1e..bb8ffbd4b 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -438,8 +438,11 @@ string tests32f[] = '{ string testsBP64[] = '{ "rv64BP/simple", "10000", + "rv64BP/mmm", "1000000", + "rv64BP/linpack_bench", "1000000", + "rv64BP/sieve", "1000000", "rv64BP/qsort", "1000000", - "rv64BP/sieve", "1000000" + "rv64BP/dhrystone", "1000000" }; string tests64p[] = '{ From 9a49cf74c33cc74b17c69ad95c13791819aaa104 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 12:14:58 -0500 Subject: [PATCH 11/19] Changed to bp config to use gshare. 
--- wally-pipelined/config/rv64BP/wally-config.vh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh index a9dbb1bda..f85e0c228 100644 --- a/wally-pipelined/config/rv64BP/wally-config.vh +++ b/wally-pipelined/config/rv64BP/wally-config.vh @@ -111,5 +111,5 @@ `define BTB_PRELOAD "../config/rv64BP/BTBPredictor.txt" `define BPRED_ENABLED 1 //`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE -`define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE" or BPLOCALPAg or BPGSHARE +`define BPTYPE "BPGSHARE" // BPTWOBIT or "BPGLOBAL" or BPLOCALPAg or BPGSHARE `define TESTSBP 1 From f5aa5d7c67d419da2dc3398336b59041ce1dd2ec Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 12:41:48 -0500 Subject: [PATCH 12/19] Forgot to include the new gshare predictor file. --- wally-pipelined/src/ifu/gsharePredictor.sv | 120 +++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 wally-pipelined/src/ifu/gsharePredictor.sv diff --git a/wally-pipelined/src/ifu/gsharePredictor.sv b/wally-pipelined/src/ifu/gsharePredictor.sv new file mode 100644 index 000000000..b4a608278 --- /dev/null +++ b/wally-pipelined/src/ifu/gsharePredictor.sv @@ -0,0 +1,120 @@ +/////////////////////////////////////////// +// globalHistoryPredictor.sv +// +// Written: Shreya Sanghai +// Email: ssanghai@hmc.edu +// Created: March 16, 2021 +// Modified: +// +// Purpose: Gshare predictor with parameterized global history register +// +// A component of the Wally configurable RISC-V project. 
+// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+/////////////////////////////////////////// + +`include "wally-config.vh" + +module gsharePredictor + #(parameter int k = 10 + ) + (input logic clk, + input logic reset, + input logic StallF, StallD, StallE, FlushF, FlushD, FlushE, + input logic [`XLEN-1:0] PCNextF, + output logic [1:0] BPPredF, + // update + input logic [4:0] InstrClassE, + input logic [4:0] BPInstrClassE, + input logic [4:0] BPInstrClassD, + input logic [4:0] BPInstrClassF, + input logic BPPredDirWrongE, + + input logic [`XLEN-1:0] PCE, + input logic PCSrcE, + input logic [1:0] UpdateBPPredE + + ); + logic [k+1:0] GHR, GHRNext; + logic [k-1:0] PHTUpdateAdr, PHTUpdateAdr0, PHTUpdateAdr1; + logic PHTUpdateEN; + logic BPClassWrongNonCFI; + logic BPClassWrongCFI; + logic BPClassRightNonCFI; + + logic [6:0] GHRMuxSel; + logic GHRUpdateEN; + logic [k-1:0] GHRLookup; + + assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0]; + assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0]; + assign BPClassWrongNonCFI = BPInstrClassE[0] & ~InstrClassE[0]; + assign BPClassRightBPWrong = BPInstrClassE[0] & InstrClassE[0] & BPPredDirWrongE; + assign BPClassRightBPRight = BPInstrClassE[0] & InstrClassE[0] & ~BPPredDirWrongE; + + + // GHR update selection, 1 hot encoded. + assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight); + assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0]; + assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0]; + assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]); + assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0]; + assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0]; + assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight)); + assign GHRUpdateEN = (| GHRMuxSel[5:1] & ~StallE) | GHRMuxSel[6] & ~StallF; + + // hoping this created a AND-OR mux. 
+ always_comb begin + case (GHRMuxSel) + 7'b000_0001: GHRNext = GHR[k-1+2:0]; // no change + 7'b000_0010: GHRNext = {GHR[k-2+2:0], PCSrcE}; // branch update + 7'b000_0100: GHRNext = {1'b0, GHR[k+1:1]}; // repair 1 + 7'b000_1000: GHRNext = {GHR[k-1+2:1], PCSrcE}; // branch update with mis prediction correction + 7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2 + 7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1 + 7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update + default: GHRNext = GHR[k-1+2:0]; + endcase + end + + flopenr #(k+2) GlobalHistoryRegister(.clk(clk), + .reset(reset), + .en((GHRUpdateEN)), + .d(GHRNext), + .q(GHR)); + + // if actively updating the GHR at the time of prediction we want to us + // GHRNext as the lookup rather than GHR. + + assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0]; + assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1]; + assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0; + assign PHTUpdateEN = InstrClassE[0] & ~StallE; + + assign GHRLookup = |GHRMuxSel[6:1] ? GHRNext[k-1:0] : GHR[k-1:0]; + + // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT + SRAM2P1R1W #(k, 2) PHT(.clk(clk), + .reset(reset), + //.RA1(GHR[k-1:0]), + .RA1(GHRLookup ^ PCNextF[k:1]), + .RD1(BPPredF), + .REN1(~StallF), + .WA1(PHTUpdateAdr ^ PCE[k:1]), + .WD1(UpdateBPPredE), + .WEN1(PHTUpdateEN), + .BitWEN1(2'b11)); + +endmodule // gsharePredictor From 8e330367acf61f25439dc5b0802c53188b3a6120 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 13:46:21 -0500 Subject: [PATCH 13/19] added clock gater to floating point divider to speed up simulation time. 
--- wally-pipelined/src/fpu/fpu.sv | 9 ++++- wally-pipelined/src/generic/clockgater.sv | 46 +++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 wally-pipelined/src/generic/clockgater.sv diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv index c876b3131..8362dbe33 100755 --- a/wally-pipelined/src/fpu/fpu.sv +++ b/wally-pipelined/src/fpu/fpu.sv @@ -275,7 +275,14 @@ module fpu ( fma1 fma1 (.*); //first and only instance of floating-point divider - fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .*); + logic fpdivClk; + + clockgater fpdivclkg(.E(FDivStartE), + .SE(DivBusyM), + .CLK(clk), + .ECLK(fpdivClk)); + + fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk)); //first of two-stage instance of floating-point add/cvt unit fpuaddcvt1 fpadd1 (.*); diff --git a/wally-pipelined/src/generic/clockgater.sv b/wally-pipelined/src/generic/clockgater.sv new file mode 100644 index 000000000..dc51829da --- /dev/null +++ b/wally-pipelined/src/generic/clockgater.sv @@ -0,0 +1,46 @@ +/////////////////////////////////////////// +// clockgater.sv +// +// Written: Ross Thompson 9 January 2021 +// Modified: +// +// Purpose: Clock gater model. Must use standard cell for synthesis. +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +/////////////////////////////////////////// + +`include "wally-config.vh" + +module clockgater + (input logic E, + input logic SE, + input logic CLK, + output logic ECLK); + + // VERY IMPORTANT. + // This part functionally models a clock gater, but does not necessarily meet the timing constrains a real standard cell would. + // Do not use this in synthesis! + + logic enable_q; + + + always @(E or SE) begin + enable_q <= E | SE; + end + assign ECLK = enable_q & CLK; + +endmodule From bccdd2c1373ad9d786186640722d3725b57550a0 Mon Sep 17 00:00:00 2001 From: "James E. 
Stine" Date: Tue, 1 Jun 2021 15:31:07 -0400 Subject: [PATCH 14/19] Updates to muldiv.sv for 32-bit div/rem --- wally-pipelined/config/rv64ic/wally-config.vh | 2 +- wally-pipelined/src/muldiv/muldiv.sv | 2 +- wally-pipelined/testbench/testbench-imperas.sv | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/wally-pipelined/config/rv64ic/wally-config.vh b/wally-pipelined/config/rv64ic/wally-config.vh index 259e41ae6..12d254ba8 100644 --- a/wally-pipelined/config/rv64ic/wally-config.vh +++ b/wally-pipelined/config/rv64ic/wally-config.vh @@ -31,7 +31,7 @@ `define XLEN 64 // MISA RISC-V configuration per specification -`define MISA (32'h00000104 | 0 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0) +`define MISA (32'h00000104 | 0 << 5 | 0 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0) `define A_SUPPORTED ((`MISA >> 0) % 2 == 1) `define C_SUPPORTED ((`MISA >> 2) % 2 == 1) `define D_SUPPORTED ((`MISA >> 3) % 2 == 1) diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv index 0c26a5df8..e10b0c55d 100644 --- a/wally-pipelined/src/muldiv/muldiv.sv +++ b/wally-pipelined/src/muldiv/muldiv.sv @@ -76,7 +76,7 @@ module muldiv ( assign Den0 = W64E ? 
{{32{SrcBE[31]&signedDivide}}, SrcBE[31:0]} : SrcBE; end else begin // RV32 has no W-type instructions assign Num0 = SrcAE; - assign Den0 = SrcAE; + assign Den0 = SrcBE; end // capture the Numerator/Denominator diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index c60aa40db..dabc6d12b 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -320,11 +320,11 @@ string tests32f[] = '{ "rv32m/I-MUL-01", "2000", "rv32m/I-MULH-01", "2000", "rv32m/I-MULHSU-01", "2000", - "rv32m/I-MULHU-01", "2000" - //"rv32m/I-DIV-01", "2000", - //"rv32m/I-DIVU-01", "2000", - //"rv32m/I-REM-01", "2000", - //"rv32m/I-REMU-01", "2000" + "rv32m/I-MULHU-01", "2000", + "rv32m/I-DIV-01", "2000", + "rv32m/I-DIVU-01", "2000", + "rv32m/I-REM-01", "2000", + "rv32m/I-REMU-01", "2000" }; string tests32ic[] = '{ From 2c140679e30d52daf68daf06aa323e09e9a2f2f7 Mon Sep 17 00:00:00 2001 From: "James E. Stine" Date: Tue, 1 Jun 2021 15:45:32 -0400 Subject: [PATCH 15/19] Minor cosmetic update to fpu.sv --- wally-pipelined/src/fpu/fpu.sv | 958 +++++++++++++++------------------ 1 file changed, 439 insertions(+), 519 deletions(-) diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv index 8362dbe33..e303f2055 100755 --- a/wally-pipelined/src/fpu/fpu.sv +++ b/wally-pipelined/src/fpu/fpu.sv @@ -25,535 +25,455 @@ `include "wally-config.vh" module fpu ( - input logic [2:0] FRM_REGW, // Rounding mode from CSR - input logic reset, + input logic [2:0] FRM_REGW, // Rounding mode from CSR + input logic reset, //input logic clear, // *** not being used anywhere - input logic clk, - input logic [31:0] InstrD, - input logic [`XLEN-1:0] SrcAE, // Integer input being processed - input logic [`XLEN-1:0] SrcAM, // Integer input being written into fpreg - input logic StallE, StallM, StallW, - input logic FlushE, FlushM, FlushW, - input logic [`AHBW-1:0] HRDATA, - input logic RegWriteD, - 
output logic [4:0] SetFflagsM, - output logic [31:0] FSROutW, - output logic [1:0] FMemRWM, - output logic FStallD, - output logic FWriteIntE, FWriteIntM, FWriteIntW, + input logic clk, + input logic [31:0] InstrD, + input logic [`XLEN-1:0] SrcAE, // Integer input being processed + input logic [`XLEN-1:0] SrcAM, // Integer input being written into fpreg + input logic StallE, StallM, StallW, + input logic FlushE, FlushM, FlushW, + input logic [`AHBW-1:0] HRDATA, + input logic RegWriteD, + output logic [4:0] SetFflagsM, + output logic [31:0] FSROutW, + output logic [1:0] FMemRWM, + output logic FStallD, + output logic FWriteIntE, FWriteIntM, FWriteIntW, output logic [`XLEN-1:0] FWriteDataM, - output logic FDivSqrtDoneM, - output logic IllegalFPUInstrD, + output logic FDivSqrtDoneM, + output logic IllegalFPUInstrD, output logic [`XLEN-1:0] FPUResultW); - - - - - //control logic signal instantiation - logic FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW; // FP register write enable - logic [2:0] FrmD, FrmE, FrmM, FrmW; // FP rounding mode - logic FmtD, FmtE, FmtM, FmtW; // FP precision 0-single 1-double - logic FDivStartD, FDivStartE; // Start division - logic FWriteIntD; // Write to integer register - logic FOutputInput2D, FOutputInput2E; // Put Input2 in Input1 if a store instruction - logic [1:0] FMemRWD, FMemRWE; // Read and write enable for memory - logic [1:0] FForwardInput1D, FForwardInput1E; // Input1 forwarding mux control signal - logic [1:0] FForwardInput2D, FForwardInput2E; // Input2 forwarding mux control signal - logic FForwardInput3D, FForwardInput3E; // Input3 forwarding mux control signal - logic FInput2UsedD; // Is input 2 used - logic FInput3UsedD; // Is input 3 used - logic [2:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select FP result - logic [3:0] FOpCtrlD, FOpCtrlE, FOpCtrlM; // Select which opperation to do in each component - - // regfile signals - logic [4:0] RdE, RdM, RdW; // ***Can take from ieu - logic [`XLEN-1:0] FWDM; // Write 
data for FP register - logic [`XLEN-1:0] FRD1D, FRD2D, FRD3D; // Read Data from FP register - logic [`XLEN-1:0] FRD1E, FRD2E, FRD3E; - logic [`XLEN-1:0] FInput1E, FInput1M, FInput1tmpE; - logic [`XLEN-1:0] FInput2E, FInput2M; - logic [`XLEN-1:0] FInput3E, FInput3M; - logic [`XLEN-1:0] FLoadStoreResultM, FLoadStoreResultW; // Result for load, store, and move to int-reg instructions - - // div/sqrt signals - logic DivDenormM, DivDenormW; - logic DivOvEn, DivUnEn; - logic DivBusyM; - logic [63:0] FDivResultM, FDivResultW; - logic [4:0] FDivFlagsM, FDivFlagsW; - - // FMA signals - logic [12:0] aligncntE, aligncntM; - logic [105:0] rE, rM; - logic [105:0] sE, sM; - logic [163:0] tE, tM; - logic [8:0] normcntE, normcntM; - logic [12:0] aeE, aeM; - logic bsE, bsM; - logic killprodE, killprodM; - logic prodofE, prodofM; - logic xzeroE, xzeroM; - logic yzeroE, yzeroM; - logic zzeroE, zzeroM; - logic xdenormE, xdenormM; - logic ydenormE, ydenormM; - logic zdenormE, zdenormM; - logic xinfE, xinfM; - logic yinfE, yinfM; - logic zinfE, zinfM; - logic xnanE, xnanM; - logic ynanE, ynanM; - logic znanE, znanM; - logic nanE, nanM; - logic [8:0] sumshiftE, sumshiftM; - logic sumshiftzeroE, sumshiftzeroM; - logic prodinfE, prodinfM; - logic [63:0] FmaResultM, FmaResultW; - logic [4:0] FmaFlagsM, FmaFlagsW; - - // add/cvt signals - logic [63:0] AddSumE, AddSumTcE; - logic [3:0] AddSelInvE; - logic [10:0] AddExpPostSumE; - logic AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE; - logic AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE; - logic AddConvertE; - logic [63:0] AddFloat1E, AddFloat2E; - logic [11:0] AddExp1DenormE, AddExp2DenormE; - logic [10:0] AddExponentE; - logic [2:0] AddRmE; - logic [3:0] AddOpTypeE; - logic AddPE, AddOvEnE, AddUnEnE; - logic AddDenormM; - logic [63:0] AddSumM, AddSumTcM; - logic [3:0] AddSelInvM; - logic [10:0] AddExpPostSumM; - logic AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM; - logic 
AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM; - logic AddConvertM, AddSignM; - logic [63:0] AddFloat1M, AddFloat2M; - logic [11:0] AddExp1DenormM, AddExp2DenormM; - logic [10:0] AddExponentM; - logic [63:0] AddOp1M, AddOp2M; - logic [2:0] AddRmM; - logic [3:0] AddOpTypeM; - logic AddPM, AddOvEnM, AddUnEnM; - logic [63:0] FAddResultM, FAddResultW; - logic [4:0] FAddFlagsM, FAddFlagsW; - - //cmp signals - logic [7:0] WE, WM; - logic [7:0] XE, XM; - logic ANaNE, ANaNM; - logic BNaNE, BNaNM; - logic AzeroE, AzeroM; - logic BzeroE, BzeroM; - logic CmpInvalidM, CmpInvalidW; - logic [1:0] CmpFCCM, CmpFCCW; - logic [63:0] FCmpResultM, FCmpResultW; - - // fsgn signals - logic [63:0] SgnResultE, SgnResultM, SgnResultW; - logic [4:0] SgnFlagsE, SgnFlagsM, SgnFlagsW; - - //instantiation of W stage regfile signals - logic [`XLEN-1:0] SrcAW; - - // classify signals - logic [63:0] ClassResultE, ClassResultM, ClassResultW; - - // other - logic [63:0] FPUResult64W, FPUResult64E; // 64-bit FPU result - logic [4:0] FPUFlagsW; - - // pipeline control logic - logic PipeEnableDE; - logic PipeEnableEM; - logic PipeEnableMW; - logic PipeClearDE; - logic PipeClearEM; - logic PipeClearMW; - - //temporarily assign pipe clear and enable signals - //to never flush & always be running - localparam PipeClear = 1'b0; - localparam PipeEnable = 1'b1; - always_comb begin - - PipeEnableDE = ~StallE; - PipeEnableEM = ~StallM; - PipeEnableMW = ~StallW; - PipeClearDE = FlushE; - PipeClearEM = FlushM; - PipeClearMW = FlushW; - - end - - - - - - - - - - - - - - //DECODE STAGE - - //Hazard unit for FPU - fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*); - - //top-level controller for FPU - fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*); - - - //regfile instantiation + // control logic signal instantiation + logic FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW; // FP register write enable + logic [2:0] FrmD, 
FrmE, FrmM, FrmW; // FP rounding mode + logic FmtD, FmtE, FmtM, FmtW; // FP precision 0-single 1-double + logic FDivStartD, FDivStartE; // Start division + logic FWriteIntD; // Write to integer register + logic FOutputInput2D, FOutputInput2E; // Put Input2 in Input1 if a store instruction + logic [1:0] FMemRWD, FMemRWE; // Read and write enable for memory + logic [1:0] FForwardInput1D, FForwardInput1E; // Input1 forwarding mux control signal + logic [1:0] FForwardInput2D, FForwardInput2E; // Input2 forwarding mux control signal + logic FForwardInput3D, FForwardInput3E; // Input3 forwarding mux control signal + logic FInput2UsedD; // Is input 2 used + logic FInput3UsedD; // Is input 3 used + logic [2:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select FP result + logic [3:0] FOpCtrlD, FOpCtrlE, FOpCtrlM; // Select which opperation to do in each component + + // regfile signals + logic [4:0] RdE, RdM, RdW; // ***Can take from ieu + logic [`XLEN-1:0] FWDM; // Write data for FP register + logic [`XLEN-1:0] FRD1D, FRD2D, FRD3D; // Read Data from FP register + logic [`XLEN-1:0] FRD1E, FRD2E, FRD3E; + logic [`XLEN-1:0] FInput1E, FInput1M, FInput1tmpE; + logic [`XLEN-1:0] FInput2E, FInput2M; + logic [`XLEN-1:0] FInput3E, FInput3M; + logic [`XLEN-1:0] FLoadStoreResultM, FLoadStoreResultW; // Result for load, store, and move to int-reg instructions + + // div/sqrt signals + logic DivDenormM, DivDenormW; + logic DivOvEn, DivUnEn; + logic DivBusyM; + logic [63:0] FDivResultM, FDivResultW; + logic [4:0] FDivFlagsM, FDivFlagsW; + + // FMA signals + logic [12:0] aligncntE, aligncntM; + logic [105:0] rE, rM; + logic [105:0] sE, sM; + logic [163:0] tE, tM; + logic [8:0] normcntE, normcntM; + logic [12:0] aeE, aeM; + logic bsE, bsM; + logic killprodE, killprodM; + logic prodofE, prodofM; + logic xzeroE, xzeroM; + logic yzeroE, yzeroM; + logic zzeroE, zzeroM; + logic xdenormE, xdenormM; + logic ydenormE, ydenormM; + logic zdenormE, zdenormM; + logic xinfE, xinfM; + logic 
yinfE, yinfM; + logic zinfE, zinfM; + logic xnanE, xnanM; + logic ynanE, ynanM; + logic znanE, znanM; + logic nanE, nanM; + logic [8:0] sumshiftE, sumshiftM; + logic sumshiftzeroE, sumshiftzeroM; + logic prodinfE, prodinfM; + logic [63:0] FmaResultM, FmaResultW; + logic [4:0] FmaFlagsM, FmaFlagsW; + + // add/cvt signals + logic [63:0] AddSumE, AddSumTcE; + logic [3:0] AddSelInvE; + logic [10:0] AddExpPostSumE; + logic AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE; + logic AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE; + logic AddConvertE; + logic [63:0] AddFloat1E, AddFloat2E; + logic [11:0] AddExp1DenormE, AddExp2DenormE; + logic [10:0] AddExponentE; + logic [2:0] AddRmE; + logic [3:0] AddOpTypeE; + logic AddPE, AddOvEnE, AddUnEnE; + logic AddDenormM; + logic [63:0] AddSumM, AddSumTcM; + logic [3:0] AddSelInvM; + logic [10:0] AddExpPostSumM; + logic AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM; + logic AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM; + logic AddConvertM, AddSignM; + logic [63:0] AddFloat1M, AddFloat2M; + logic [11:0] AddExp1DenormM, AddExp2DenormM; + logic [10:0] AddExponentM; + logic [63:0] AddOp1M, AddOp2M; + logic [2:0] AddRmM; + logic [3:0] AddOpTypeM; + logic AddPM, AddOvEnM, AddUnEnM; + logic [63:0] FAddResultM, FAddResultW; + logic [4:0] FAddFlagsM, FAddFlagsW; + + // cmp signals + logic [7:0] WE, WM; + logic [7:0] XE, XM; + logic ANaNE, ANaNM; + logic BNaNE, BNaNM; + logic AzeroE, AzeroM; + logic BzeroE, BzeroM; + logic CmpInvalidM, CmpInvalidW; + logic [1:0] CmpFCCM, CmpFCCW; + logic [63:0] FCmpResultM, FCmpResultW; + + // fsgn signals + logic [63:0] SgnResultE, SgnResultM, SgnResultW; + logic [4:0] SgnFlagsE, SgnFlagsM, SgnFlagsW; + + // instantiation of W stage regfile signals + logic [`XLEN-1:0] SrcAW; + + // classify signals + logic [63:0] ClassResultE, ClassResultM, ClassResultW; + + // 64-bit FPU result + logic [63:0] FPUResult64W, FPUResult64E; + logic [4:0] 
FPUFlagsW; + + // pipeline control logic + logic PipeEnableDE; + logic PipeEnableEM; + logic PipeEnableMW; + logic PipeClearDE; + logic PipeClearEM; + logic PipeClearMW; + + // temporarily assign pipe clear and enable signals + // to never flush & always be running + localparam PipeClear = 1'b0; + localparam PipeEnable = 1'b1; + always_comb begin + PipeEnableDE = ~StallE; + PipeEnableEM = ~StallM; + PipeEnableMW = ~StallW; + PipeClearDE = FlushE; + PipeClearEM = FlushM; + PipeClearMW = FlushW; + end + + //DECODE STAGE + + // Hazard unit for FPU + fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*); + + // top-level controller for FPU + fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*); + + // regfile instantiation FPregfile fpregfile (clk, reset, FWriteEnW, InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW, FPUResult64W, FRD1D, FRD2D, FRD3D); - - - - - - - - - - //***************** - //fpregfile D/E pipe registers - //***************** - flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E); - flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E); - flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E); - - //***************** - //other D/E pipe registers - //***************** - flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE); - flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE); - flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE); - flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE); - flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE); - flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE); - flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE); - flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, 
FForwardInput1D, FForwardInput1E); - flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E); - flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E); - flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E); - flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE); - flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E); - flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE); - - - - - - - - - - - - - - //EXECUTION STAGE - - - - // input muxs for forwarding - mux4 #(64) FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE); - mux3 #(64) FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E); - mux2 #(64) FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E); - mux2 #(64) FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E); - - fma1 fma1 (.*); - - //first and only instance of floating-point divider - logic fpdivClk; - - clockgater fpdivclkg(.E(FDivStartE), - .SE(DivBusyM), - .CLK(clk), - .ECLK(fpdivClk)); - - fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk)); - - //first of two-stage instance of floating-point add/cvt unit - fpuaddcvt1 fpadd1 (.*); - - //first of two-stage instance of floating-point comparator - fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]); - - //first and only instance of floating-point sign converter - fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*); - - //first and only instance of floating-point classify unit - fpuclassify fpuclass (.*); - - - - - - - - - - - - - - - - - //***************** - //fpregfile D/E pipe registers - //***************** - flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M); - flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M); - 
flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M); - - //***************** - //fma E/M pipe registers - //***************** - flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); - flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); - flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); - flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); - flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); - flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM); - flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); - flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); - flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); - flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); - flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); - flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); - flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); - flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); - flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); - flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); - flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); - flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); - flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); - flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); - flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); - flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); - flopenrc #(9) EMRegFma23(clk, reset, 
PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); - flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); - flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); - - //***************** - //fpadd E/M pipe registers - //***************** - flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); - flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); - flopenrc #(4) EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); - flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); - flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); - flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); - flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); - flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); - flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); - flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); - flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); - flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); - flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); - flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); - flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); - flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); - flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); - flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); - flopenrc #(12) 
EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); - flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); - flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); - flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); - flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); - flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); - flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); - - //***************** - //fpcmp E/M pipe registers - //***************** - flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); - flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); - flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); - flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); - flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); - flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); - - //put this in for the event we want to delay fsgn - will otherwise bypass - //***************** - //fpsgn E/M pipe registers - //***************** - flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM); - flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM); - - //***************** - //other E/M pipe registers - //***************** - flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM); - flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM); - flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM); - flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM); - flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM); - flopenrc #(4) EMReg6(clk, reset, 
PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM); - flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM); - flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM); - - //***************** - //fpuclassify E/M pipe registers - //***************** - flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM); - - - - - - - - - //BEGIN MEMORY STAGE - - assign FWriteDataM = FInput1M; - - mux2 #(64) FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM); - - fma2 fma2(.*); - - //second instance of two-stage floating-point add/cvt unit - fpuaddcvt2 fpadd2 (.*); - - //second instance of two-stage floating-point comparator - fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*); - - - - - - - - - - - - //***************** - //fma M/W pipe registers - //***************** - flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); - flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); - - //***************** - //fpdiv M/W pipe registers - //***************** - flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); - flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW); - flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); - - //***************** - //fpadd M/W pipe registers - //***************** - flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); - flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); - - //***************** - //fpcmp M/W pipe registers - //***************** - flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); - flopenrc #(2) MWRegCmp2(clk, reset, 
PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); - flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); - - //***************** - //fpsgn M/W pipe registers - //***************** - flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW); - flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW); - - //***************** - //other M/W pipe registers - //***************** - flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW); - flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW); - flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW); - flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW); - flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW); - flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW); - flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW); - - - //***************** - //fpuclassify M/W pipe registers - //***************** - flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW); - - - - - - - + + //***************** + // fpregfile D/E pipe registers + //***************** + flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E); + flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E); + flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E); + + //***************** + // other D/E pipe registers + //***************** + flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE); + flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE); + flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE); + flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE); + flopenrc #(5) DEReg8(clk, 
reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE); + flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE); + flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE); + flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput1D, FForwardInput1E); + flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E); + flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E); + flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E); + flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE); + flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E); + flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE); + + //EXECUTION STAGE + + // input muxs for forwarding + mux4 #(64) FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE); + mux3 #(64) FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E); + mux2 #(64) FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E); + mux2 #(64) FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E); + + fma1 fma1 (.*); + + // first and only instance of floating-point divider + logic fpdivClk; + + clockgater fpdivclkg(.E(FDivStartE), + .SE(DivBusyM), + .CLK(clk), + .ECLK(fpdivClk)); + + fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk)); + + // first of two-stage instance of floating-point add/cvt unit + fpuaddcvt1 fpadd1 (.*); + + // first of two-stage instance of floating-point comparator + fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]); + + // first and only instance of floating-point sign converter + fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*); + + // first and only instance of floating-point classify unit + fpuclassify fpuclass (.*); + + //***************** + 
// fpregfile E/M pipe registers + //***************** + flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M); + flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M); + flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M); + + //***************** + // fma E/M pipe registers + //***************** + flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); + flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); + flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); + flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); + flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); + flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM); + flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); + flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); + flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); + flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); + flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); + flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); + flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); + flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); + flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); + flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); + flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); + flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); + flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); + flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, 
PipeEnableEM, ynanE, ynanM); + flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); + flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); + flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); + flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); + flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); + + //***************** + // fpadd E/M pipe registers + //***************** + flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); + flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); + flopenrc #(4) EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); + flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); + flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); + flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); + flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); + flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); + flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); + flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); + flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); + flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); + flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); + flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); + flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); + flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, 
AddFloat1M); + flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); + flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); + flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); + flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); + flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); + flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); + flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); + flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); + flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); + + //***************** + // fpcmp E/M pipe registers + //***************** + flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); + flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); + flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); + flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); + flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); + flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); + + // put this in for the event we want to delay fsgn - will otherwise bypass + //***************** + // fpsgn E/M pipe registers + //***************** + flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM); + flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM); + + //***************** + // other E/M pipe registers + //***************** + flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM); + flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM); + flopenrc #(3) EMReg3(clk, reset, 
PipeClearEM, PipeEnableEM, FrmE, FrmM); + flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM); + flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM); + flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM); + flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM); + flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM); + + //***************** + // fpuclassify E/M pipe registers + //***************** + flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM); + + //BEGIN MEMORY STAGE + + assign FWriteDataM = FInput1M; + + mux2 #(64) FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM); + + fma2 fma2(.*); + + // second instance of two-stage floating-point add/cvt unit + fpuaddcvt2 fpadd2 (.*); + + // second instance of two-stage floating-point comparator + fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), + .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*); + + //***************** + // fma M/W pipe registers + //***************** + flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); + flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); + + //***************** + // fpdiv M/W pipe registers + //***************** + flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); + flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW); + flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); + + //***************** + // fpadd M/W pipe registers + //***************** + flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); + flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); + + 
//***************** + // fpcmp M/W pipe registers + //***************** + flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); + flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); + flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); + + //***************** + // fpsgn M/W pipe registers + //***************** + flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW); + flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW); + + //***************** + // other M/W pipe registers + //***************** + flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW); + flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW); + flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW); + flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW); + flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW); + flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW); + flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW); + + //***************** + // fpuclassify M/W pipe registers + //***************** + flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW); //######################################### - //BEGIN WRITEBACK STAGE + // BEGIN WRITEBACK STAGE //######################################### - - always_comb begin - case (FResultSelW) - // div/sqrt - 3'b000 : FPUFlagsW = FDivFlagsW; - // cmp - 3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0}; - //fma/mult - 3'b010 : FPUFlagsW = FmaFlagsW; - // sgn inj - 3'b011 : FPUFlagsW = SgnFlagsW; - // add/sub/cnvt - 3'b100 : FPUFlagsW = FAddFlagsW; - // classify - 3'b101 : FPUFlagsW = 5'b0; - // output SrcAW - 3'b110 : FPUFlagsW = 5'b0; - // output FRD1 - 3'b111 : 
FPUFlagsW = 5'b0; - default : FPUFlagsW = 5'bxxxxx; - endcase - end - - - always_comb begin - case (FResultSelW) - // div/sqrt - 3'b000 : FPUResult64W = FDivResultW; - // cmp - 3'b001 : FPUResult64W = FCmpResultW; - //fma/mult - 3'b010 : FPUResult64W = FmaResultW; - // sgn inj - 3'b011 : FPUResult64W = SgnResultW; - // add/sub/cnvt - 3'b100 : FPUResult64W = FAddResultW; - // classify - 3'b101 : FPUResult64W = ClassResultW; - // output SrcAW - 3'b110 : FPUResult64W = SrcAW; - // Load/Store/Move to FP-register - 3'b111 : FPUResult64W = FLoadStoreResultW; - default : FPUResult64W = {64{1'bx}}; - endcase - end - //interface between XLEN size datapath and double-precision sized - //floating-point results - // - //define offsets for LSB zero extension or truncation - always_comb begin - - //zero extension + + always_comb begin + case (FResultSelW) + // div/sqrt + 3'b000 : FPUFlagsW = FDivFlagsW; + // cmp + 3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0}; + //fma/mult + 3'b010 : FPUFlagsW = FmaFlagsW; + // sgn inj + 3'b011 : FPUFlagsW = SgnFlagsW; + // add/sub/cnvt + 3'b100 : FPUFlagsW = FAddFlagsW; + // classify + 3'b101 : FPUFlagsW = 5'b0; + // output SrcAW + 3'b110 : FPUFlagsW = 5'b0; + // output FRD1 + 3'b111 : FPUFlagsW = 5'b0; + default : FPUFlagsW = 5'bxxxxx; + endcase + end + + always_comb begin + case (FResultSelW) + // div/sqrt + 3'b000 : FPUResult64W = FDivResultW; + // cmp + 3'b001 : FPUResult64W = FCmpResultW; + //fma/mult + 3'b010 : FPUResult64W = FmaResultW; + // sgn inj + 3'b011 : FPUResult64W = SgnResultW; + // add/sub/cnvt + 3'b100 : FPUResult64W = FAddResultW; + // classify + 3'b101 : FPUResult64W = ClassResultW; + // output SrcAW + 3'b110 : FPUResult64W = SrcAW; + // Load/Store/Move to FP-register + 3'b111 : FPUResult64W = FLoadStoreResultW; + default : FPUResult64W = {64{1'bx}}; + endcase + end // always_comb + + // interface between XLEN size datapath and double-precision sized + // floating-point results + // + // define offsets for LSB zero extension or 
truncation + always_comb begin + // zero extension FPUResultW = FPUResult64W[63:64-`XLEN]; - SetFflagsM = FPUFlagsW; + SetFflagsM = FPUFlagsW; + end + +endmodule // fpu - end -endmodule From 7afbd8d877c4397961684b143af7ae420209e1b1 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 1 Jun 2021 15:05:22 -0500 Subject: [PATCH 16/19] The clock gater was not implemented correctly. Now it is level sensitive to a low clock. --- wally-pipelined/src/generic/clockgater.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wally-pipelined/src/generic/clockgater.sv b/wally-pipelined/src/generic/clockgater.sv index dc51829da..c06a1cbdc 100644 --- a/wally-pipelined/src/generic/clockgater.sv +++ b/wally-pipelined/src/generic/clockgater.sv @@ -38,7 +38,7 @@ module clockgater logic enable_q; - always @(E or SE) begin + always @(~CLK) begin enable_q <= E | SE; end assign ECLK = enable_q & CLK; From 7f5e5287b0fd594ae379f3866f3080e47b2fdca9 Mon Sep 17 00:00:00 2001 From: "James E. Stine" Date: Tue, 1 Jun 2021 17:39:54 -0400 Subject: [PATCH 17/19] delete div.bak --- wally-pipelined/src/muldiv/div.bak | 1560 ---------------------------- 1 file changed, 1560 deletions(-) delete mode 100755 wally-pipelined/src/muldiv/div.bak diff --git a/wally-pipelined/src/muldiv/div.bak b/wally-pipelined/src/muldiv/div.bak deleted file mode 100755 index 4266ae61a..000000000 --- a/wally-pipelined/src/muldiv/div.bak +++ /dev/null @@ -1,1560 +0,0 @@ -/////////////////////////////////////////// -// mul.sv -// -// Written: James.Stine@okstate.edu 1 February 2021 -// Modified: -// -// Purpose: Integer Divide instructions -// -// A component of the Wally configurable RISC-V project. 
-// -// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, -// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software -// is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT -// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -/////////////////////////////////////////// - -// *** I added these verilator controls to clean up the -// lint output. The linter warnings should be fixed, but now the output is at -// least readable. 
-/* verilator lint_off COMBDLY */ -/* verilator lint_off IMPLICIT */ - -`include "wally-config.vh" - -module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S); - - input logic [63:0] N, D; - input logic clk; - input logic reset; - input logic start; - input logic S; - - output logic [63:0] Qf; - output logic [63:0] remf; - output logic div0; - output logic done; - output logic divBusy; - - logic divdone; - logic enable; - logic state0; - logic V; - logic [7:0] Num; - logic [5:0] P, NumIter, RemShift; - logic [63:0] op1, op2, op1shift, Rem5; - logic [64:0] Qd, Rd, Qd2, Rd2; - logic [63:0] Q, rem0; - logic [3:0] quotient; - logic otfzero; - logic shiftResult; - logic enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp; - - logic [63:0] twoD; - logic [63:0] twoN; - logic SignD; - logic SignN; - logic [63:0] QT, remT; - logic D_NegOne; - logic Max_N; - - // Check if negative (two's complement) - // If so, convert to positive - adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD); - adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN); - assign SignD = D[63]; - assign SignN = N[63]; - // Max N and D = -1 (Overflow) - assign Max_N = (~|N[62:0]) & N[63]; - assign D_NegOne = &D; - - // Divider goes the distance to 37 cycles - // (thanks to the evil divisor for D = 0x1) - - // Shift D, if needed (for integer) - // needed to allow qst to be in range for integer - // division [1,2) and allow integer divide to work. - // - // The V or valid bit can be used to determine if D - // is 0 and thus a divide by 0 exception. This div0 - // exception is given to FSM to tell the operation to - // quit gracefully. 
- - lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD)); - shift_left #(64) p2 (twoD, P, op2); - assign op1 = twoN; - assign div0 = ~V; - - // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0) - // v = 2 since \rho < 1 (add 4 to make sure its a ceil) - adder #(8) cpa3 ({2'b0, P}, - {5'h0, shiftResult, ~shiftResult, 1'b0}, - Num); - - // Determine whether need to add just Q/Rem - assign shiftResult = P[0]; - // div by 2 (ceil) - assign NumIter = Num[6:1]; - assign RemShift = P; - - // FSM to control integer divider - // assume inputs are postive edge and - // datapath (divider) is negative edge - fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv, - start, div0, NumIter, ~clk, reset); - - flopr #(1) rega (~clk, reset, donev, done); - flopr #(1) regb (~clk, reset, divdonev, divdone); - flopr #(1) regc (~clk, reset, otfzerov, otfzero); - flopr #(1) regd (~clk, reset, enablev, enable); - flopr #(1) rege (~clk, reset, state0v, state0); - flopr #(1) regf (~clk, reset, divBusyv, divBusy); - - // To obtain a correct remainder the last bit of the - // quotient has to be aligned with a radix-r boundary. - // Since the quotient is in the range 1/2 < q < 2 (one - // integer bit and m fractional bits), this is achieved by - // shifting N right by v+s so that (m+v+s) mod k = 0. And, - // the quotient has to be aligned to the integer position. - - divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, - enable, otfzero, shiftResult); - - // Storage registers to hold contents stable - flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2); - flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2); - - // Probably not needed - just assigns results - assign Q = Qd2[63:0]; - assign Rem5 = Rd2[64:1]; - - // Adjust remainder by m - shift_right #(64) p4 (Rem5, RemShift, rem0); - - // Adjust Q/Rem for Signed - assign tcQ = (SignN ^ SignD) & S; - assign tcR = SignN & S; - // Signed Divide - // - When N and D are negative: Remainder is negative (undergoes a two's complement). 
- // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement). - // - When D is negative: Quotient is negative (undergoes a two's complement). - adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT); - adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT); - - // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec) - exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf); - -endmodule // int32div - -module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, - enable, otfzero, shiftResult); - - input logic [63:0] op1, op2; - input logic clk, state0; - input logic reset; - input logic enable; - input logic otfzero; - input logic shiftResult; - - output logic [64:0] rem0; - output logic [64:0] Q; - output logic [3:0] quotient; - - logic [67:0] Sum, Carry; - logic [64:0] Qstar; - logic [64:0] QMstar; - logic [7:0] qtotal; - logic [67:0] SumN, CarryN, SumN2, CarryN2; - logic [67:0] divi1, divi2, divi1c, divi2c, dive1; - logic [67:0] mdivi_temp, mdivi; - logic zero; - logic [1:0] qsel; - logic [1:0] Qin, QMin; - logic CshiftQ, CshiftQM; - logic [67:0] rem1, rem2, rem3; - logic [67:0] SumR, CarryR; - logic [64:0] Qt; - - // Create one's complement values of Divisor (for q*D) - assign divi1 = {3'h0, op2, 1'b0}; - assign divi2 = {2'h0, op2, 2'b0}; - assign divi1c = ~divi1; - assign divi2c = ~divi2; - // Shift x1 if not mod k - mux2 #(68) mx1 ({3'b000, op1, 1'b0}, {4'h0, op1}, shiftResult, dive1); - - // I I I . F F F F F ... 
(Robertson Criteria - \rho * qmax * D) - mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN); - mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN); - // Simplify QST - adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal); - // q = {+2, +1, -1, -2} else q = 0 - qst4 pd1 (qtotal[7:1], divi1[63:61], quotient); - assign ulp = quotient[2]|quotient[3]; - assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]); - // Map to binary encoding - assign qsel[1] = quotient[3]|quotient[2]; - assign qsel[0] = quotient[3]|quotient[1]; - mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp); - mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi); - csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry); - // regs : save CSA - flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2); - flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2); - // OTF - ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM); - otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, - otfzero, enable, Qstar, QMstar); - - // Correction and generation of Remainder - adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1); - // Add back +D as correction - csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR); - adder #(68) cpa3 (SumR, CarryR, rem2); - // Choose remainder (Rem or Rem+D) - mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3); - // Choose correct Q or QM - mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt); - // Final results - assign rem0 = rem3[64:0]; - assign Q = Qt; - -endmodule // divide4x64 - -module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM); - - input logic [3:0] quot; - - output logic [1:0] Qin; - output logic [1:0] QMin; - output logic CshiftQ; - output logic CshiftQM; - - // Load/Store Control for OTF - assign Qin[1] = (quot[1]) | (quot[3]) | (quot[0]); - assign Qin[0] = (quot[1]) | (quot[2]); - assign QMin[1] = (quot[1]) | (!quot[3]&!quot[2]&!quot[1]&!quot[0]); - assign QMin[0] = (quot[3]) | (quot[0]) | - (!quot[3]&!quot[2]&!quot[1]&!quot[0]); - assign 
CshiftQ = (quot[1]) | (quot[0]); - assign CshiftQM = (quot[3]) | (quot[2]); - -endmodule - -// On-the-fly Conversion per Ercegovac/Lang - -module otf #(parameter WIDTH=8) - (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q); - - input logic [1:0] Qin, QMin; - input logic CshiftQ, CshiftQM; - input logic clk; - input logic reset; - input logic enable; - - output logic [WIDTH-1:0] R2Q; - output logic [WIDTH-1:0] R1Q; - - logic [WIDTH-1:0] Qstar, QMstar; - logic [WIDTH-1:0] M1Q, M2Q; - - // QM - mux2 #(WIDTH) m1 (QMstar, Qstar, CshiftQM, M1Q); - flopenr #(WIDTH) r1 (clk, reset, enable, {M1Q[WIDTH-3:0], QMin}, R1Q); - // Q - mux2 #(WIDTH) m2 (Qstar, QMstar, CshiftQ, M2Q); - flopenr #(WIDTH) r2 (clk, reset, enable, {M2Q[WIDTH-3:0], Qin}, R2Q); - - assign Qstar = R2Q; - assign QMstar = R1Q; - -endmodule // otf8 - -module adder #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, - output logic [WIDTH-1:0] y); - - assign y = a + b; - -endmodule // adder - -module fa (input logic a, b, c, output logic sum, carry); - - assign sum = a^b^c; - assign carry = a&b|a&c|b&c; - -endmodule // fa - -module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c, - output logic [WIDTH-1:0] sum, carry); - - logic [WIDTH:0] carry_temp; - genvar i; - generate - for (i=0;i B. LT and GT are both '0' if A = B. - -module magcompare2b (LT, GT, A, B); - - input logic [1:0] A; - input logic [1:0] B; - - output logic LT; - output logic GT; - - // Determine if A < B using a minimized sum-of-products expression - assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0]; - // Determine if A > B using a minimized sum-of-products expression - assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0]; - -endmodule // magcompare2b - -// J. E. Stine and M. J. Schulte, "A combined two's complement and -// floating-point comparator," 2005 IEEE International Symposium on -// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
-// doi: 10.1109/ISCAS.2005.1464531 - -module magcompare8 (LT, EQ, A, B); - - input logic [7:0] A; - input logic [7:0] B; - - logic [3:0] s; - logic [3:0] t; - logic [1:0] u; - logic [1:0] v; - logic GT; - //wire LT; - - output logic EQ; - output logic LT; - - magcompare2b mag1 (s[0], t[0], A[1:0], B[1:0]); - magcompare2b mag2 (s[1], t[1], A[3:2], B[3:2]); - magcompare2b mag3 (s[2], t[2], A[5:4], B[5:4]); - magcompare2b mag4 (s[3], t[3], A[7:6], B[7:6]); - - magcompare2b mag5 (u[0], v[0], t[1:0], s[1:0]); - magcompare2b mag6 (u[1], v[1], t[3:2], s[3:2]); - - magcompare2b mag7 (LT, GT, v[1:0], u[1:0]); - - assign EQ = ~(GT | LT); - -endmodule // magcompare8 - -module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf); - - input logic [63:0] Q; - input logic [63:0] rem; - input logic [63:0] op1; - input logic S; - input logic div0; - input logic Max_N; - input logic D_NegOne; - - output logic [63:0] Qf; - output logic [63:0] remf; - - // Needs to be optimized - always_comb - case ({div0, S, Max_N, D_NegOne}) - 4'b0000 : Qf = Q; - 4'b0001 : Qf = Q; - 4'b0010 : Qf = Q; - 4'b0011 : Qf = Q; - 4'b0100 : Qf = Q; - 4'b0101 : Qf = Q; - 4'b0110 : Qf = Q; - 4'b0111 : Qf = {1'b1, 31'h0}; - 4'b1000 : Qf = {64{1'b1}}; - 4'b1001 : Qf = {64{1'b1}}; - 4'b1010 : Qf = {64{1'b1}}; - 4'b1011 : Qf = {64{1'b1}}; - 4'b1100 : Qf = {64{1'b1}}; - 4'b1101 : Qf = {64{1'b1}}; - 4'b1110 : Qf = {64{1'b1}}; - 4'b1111 : Qf = {64{1'b1}}; - default: Qf = Q; - endcase - - always_comb - case ({div0, S, Max_N, D_NegOne}) - 4'b0000 : remf = rem; - 4'b0001 : remf = rem; - 4'b0010 : remf = rem; - 4'b0011 : remf = rem; - 4'b0100 : remf = rem; - 4'b0101 : remf = rem; - 4'b0110 : remf = rem; - 4'b0111 : remf = 64'h0; - 4'b1000 : remf = op1; - 4'b1001 : remf = op1; - 4'b1010 : remf = op1; - 4'b1011 : remf = op1; - 4'b1100 : remf = op1; - 4'b1101 : remf = op1; - 4'b1110 : remf = op1; - 4'b1111 : remf = op1; - default: remf = rem; - endcase - -endmodule // exception_int - -/* verilator lint_on 
COMBDLY */ -/* verilator lint_on IMPLICIT */ - From 06cf3a84036f7d1c8d59be0a5b2f5bc19923d87e Mon Sep 17 00:00:00 2001 From: Kip Macsai-Goren Date: Tue, 1 Jun 2021 17:49:45 -0400 Subject: [PATCH 18/19] Edited and added constants to support SV48 --- .../config/buildroot/wally-constants.vh | 26 +++++++++++++------ .../config/busybear/wally-constants.vh | 26 +++++++++++++------ .../config/coremark/wally-constants.vh | 26 +++++++++++++------ .../config/coremark_bare/wally-constants.vh | 26 +++++++++++++------ .../config/rv32ic/wally-constants.vh | 12 ++++++++- .../config/rv64BP/wally-constants.vh | 26 +++++++++++++------ .../config/rv64ic/wally-constants.vh | 26 +++++++++++++------ .../config/rv64icfd/wally-constants.vh | 26 +++++++++++++------ .../config/rv64imc/wally-constants.vh | 26 +++++++++++++------ 9 files changed, 155 insertions(+), 65 deletions(-) diff --git a/wally-pipelined/config/buildroot/wally-constants.vh b/wally-pipelined/config/buildroot/wally-constants.vh index 43d958632..cc6c27fc1 100644 --- a/wally-pipelined/config/buildroot/wally-constants.vh +++ b/wally-pipelined/config/buildroot/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. 
// @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/busybear/wally-constants.vh b/wally-pipelined/config/busybear/wally-constants.vh index 43d958632..cc6c27fc1 100644 --- a/wally-pipelined/config/busybear/wally-constants.vh +++ b/wally-pipelined/config/busybear/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/coremark/wally-constants.vh b/wally-pipelined/config/coremark/wally-constants.vh index 43d958632..cc6c27fc1 100644 --- a/wally-pipelined/config/coremark/wally-constants.vh +++ b/wally-pipelined/config/coremark/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/coremark_bare/wally-constants.vh b/wally-pipelined/config/coremark_bare/wally-constants.vh index 43d958632..cc6c27fc1 100644 --- a/wally-pipelined/config/coremark_bare/wally-constants.vh +++ b/wally-pipelined/config/coremark_bare/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/rv32ic/wally-constants.vh b/wally-pipelined/config/rv32ic/wally-constants.vh index ec4a48b4d..f4c5ce9aa 100644 --- a/wally-pipelined/config/rv32ic/wally-constants.vh +++ b/wally-pipelined/config/rv32ic/wally-constants.vh @@ -2,7 +2,10 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 31 May 2021 +// added svmode constants. These aren't strictly necessary since we're just checking one bit, +// but they're here to stay consistent and to make sure we don't wind up in +// a "NO_TRANSLATE undefined" situation. // // Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. 
// These macros should not be changed, except in the event of an @@ -31,3 +34,10 @@ `define PPN_BITS 22 `define PPN_HIGH_SEGMENT_BITS 12 `define PA_BITS 34 +`define SVMODE_BITS 1 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 // These two are only here to stop +`define SV48 9 // the verilator from yelling at me diff --git a/wally-pipelined/config/rv64BP/wally-constants.vh b/wally-pipelined/config/rv64BP/wally-constants.vh index 43d958632..cc6c27fc1 100644 --- a/wally-pipelined/config/rv64BP/wally-constants.vh +++ b/wally-pipelined/config/rv64BP/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/rv64ic/wally-constants.vh b/wally-pipelined/config/rv64ic/wally-constants.vh index 43d958632..cc6c27fc1 100644 --- a/wally-pipelined/config/rv64ic/wally-constants.vh +++ b/wally-pipelined/config/rv64ic/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/rv64icfd/wally-constants.vh b/wally-pipelined/config/rv64icfd/wally-constants.vh index 43d958632..cc6c27fc1 100644 --- a/wally-pipelined/config/rv64icfd/wally-constants.vh +++ b/wally-pipelined/config/rv64icfd/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 diff --git a/wally-pipelined/config/rv64imc/wally-constants.vh b/wally-pipelined/config/rv64imc/wally-constants.vh index 43d958632..cc6c27fc1 100644 --- a/wally-pipelined/config/rv64imc/wally-constants.vh +++ b/wally-pipelined/config/rv64imc/wally-constants.vh @@ -2,11 +2,14 @@ // wally-constants.vh // // Written: tfleming@hmc.edu 4 March 2021 -// Modified: +// Modified: Kmacsaigoren@hmc.edu 31 May 2021 +// Added constants for checking sv mode and changed existing constants to accommodate +// both sv48 and sv39 // -// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture. -// These macros should not be changed, except in the event of an -// update to the architecture or particularly special circumstances. +// Purpose: Specify constants necessary for different memory virtualization modes. +// These are specific to sv48, defined in section 4.5 of the privileged spec. +// However, despite different constants for different modes, the hardware helps distinguish between +// each mode. // // A component of the Wally configurable RISC-V project. // @@ -25,9 +28,16 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// -// Virtual Memory Constants (sv39) +// Virtual Memory Constants (sv48) `define VPN_SEGMENT_BITS 9 -`define VPN_BITS 27 +`define VPN_BITS 36 +`define PPN_HIGH_SEGMENT_BITS 17 `define PPN_BITS 44 -`define PPN_HIGH_SEGMENT_BITS 26 -`define PA_BITS 56 +`define PA_BITS 56 +`define SVMODE_BITS 4 +// constants to check SATP_MODE against +// defined in Table 4.3 of the privileged spec +`define NO_TRANSLATE 0 +`define SV32 1 +`define SV39 8 +`define SV48 9 From f7deda0514a51d485a43cd6434f613cd12e72bca Mon Sep 17 00:00:00 2001 From: Kip Macsai-Goren Date: Tue, 1 Jun 2021 17:50:37 -0400 Subject: [PATCH 19/19] implemented Sv48. --- wally-pipelined/src/mmu/cam_line.sv | 20 +++-- wally-pipelined/src/mmu/page_number_mixer.sv | 87 +++++++++++++++----- wally-pipelined/src/mmu/pagetablewalker.sv | 60 ++++++++++---- wally-pipelined/src/mmu/tlb.sv | 33 +++++--- wally-pipelined/src/mmu/tlb_cam.sv | 25 +++--- 5 files changed, 160 insertions(+), 65 deletions(-) diff --git a/wally-pipelined/src/mmu/cam_line.sv b/wally-pipelined/src/mmu/cam_line.sv index b75775738..6bab0b60b 100644 --- a/wally-pipelined/src/mmu/cam_line.sv +++ b/wally-pipelined/src/mmu/cam_line.sv @@ -2,7 +2,9 @@ // cam_line.sv // // Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// Implemented SV48 on top of SV39. This included adding SvMode input signal and the wally constants +// Mostly this was done to make the PageNumberMixer work. // // Purpose: CAM line for the translation lookaside buffer (TLB) // Determines whether a virtual address matches the stored key. @@ -24,12 +26,17 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/////////////////////////////////////////// +`include "wally-constants.vh" + module cam_line #(parameter KEY_BITS = 20, parameter HIGH_SEGMENT_BITS = 10) ( input clk, reset, + // input to check which SvMode is running + input [`SVMODE_BITS-1:0] SvMode, + // The requested page number to compare against the key - input [KEY_BITS-1:0] VirtualPageNumber, + input [KEY_BITS-1:0] VirtualPageNumber, // Signals to write a new entry to this line input CAMLineWrite, @@ -38,10 +45,11 @@ module cam_line #(parameter KEY_BITS = 20, // Flush this line (set valid to 0) input TLBFlush, - // This entry is a key for a giga, mega, or kilopage. + // This entry is a key for a tera, giga, mega, or kilopage. // PageType == 2'b00 --> kilopage // PageType == 2'b01 --> megapage - // PageType == 2'b11 --> gigapage + // PageType == 2'b10 --> gigapage + // PageType == 2'b11 --> terapage output [1:0] PageType, // *** should this be the stored version or the always updated one? output Match ); @@ -67,9 +75,9 @@ module cam_line #(parameter KEY_BITS = 20, flopenr #(KEY_BITS) keyflop(clk, reset, CAMLineWrite, VirtualPageNumber, Key); // Calculate the actual query key based on the input key and the page type. - // For example, a megapage in sv39 only cares about VPN2 and VPN1, so VPN0 + // For example, a megapage in SV39 only cares about VPN2 and VPN1, so VPN0 // should automatically match. 
- page_number_mixer #(KEY_BITS, HIGH_SEGMENT_BITS) mixer(VirtualPageNumber, Key, PageType, VirtualPageNumberQuery); + page_number_mixer #(KEY_BITS, HIGH_SEGMENT_BITS) mixer(VirtualPageNumber, Key, PageType, SvMode, VirtualPageNumberQuery); assign Match = ({1'b1, VirtualPageNumberQuery} == {Valid, Key}); diff --git a/wally-pipelined/src/mmu/page_number_mixer.sv b/wally-pipelined/src/mmu/page_number_mixer.sv index 57b8e4b77..03851018d 100644 --- a/wally-pipelined/src/mmu/page_number_mixer.sv +++ b/wally-pipelined/src/mmu/page_number_mixer.sv @@ -2,7 +2,11 @@ // page_number_mixer.sv // // Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// Implemented SV48 on top of SV39. This included adding a 3rd Segment to each of the pagenumbers, +// Ensuring that the BITS and HIGH_SEGMENT_BITS inputs were correct everywhere this module gets instantiated, +// Adding several muxes to decide the bit selection to turn pagenumbers into segments based on SV mode, +// Adding support for terapage/newgigapage encoding. // // Purpose: Takes two page numbers and replaces segments of the first page // number with segments from the second, based on the page type. @@ -25,22 +29,29 @@ /////////////////////////////////////////// `include "wally-config.vh" +`include "wally-constants.vh" module page_number_mixer #(parameter BITS = 20, parameter HIGH_SEGMENT_BITS = 10) ( - input [BITS-1:0] PageNumber, - input [BITS-1:0] MixPageNumber, - input [1:0] PageType, - output [BITS-1:0] PageNumberCombined + input [BITS-1:0] PageNumber, + input [BITS-1:0] MixPageNumber, + input [1:0] PageType, + input [`SVMODE_BITS-1:0] SvMode, + + output [BITS-1:0] PageNumberCombined ); + // The upper segment might have a different width than the lower segments. + // For example, an SV39 PTE has 26 bits for PPN2 and 9 bits for the other + // segments. 
This is outside the 'if XLEN' b/c the constant is already configured + // to the correct value for the XLEN in the relevant wally-constants.vh file. + localparam LOW_SEGMENT_BITS = `VPN_SEGMENT_BITS; + // *** each time this module is implemented, low segment bits is either + // `VPN_SEGMENT_BITS or `PPN_LOW_SEGMENT_BITS (if it existed) + // in every mode so far, these are the same, so it's left as it is above. + generate - // *** Just checking XLEN is not enough to support sv39 AND sv48. if (`XLEN == 32) begin - // The upper segment might have a different width than the lower segments. - // For example, an sv39 PTE has 26 bits for PPN2 and 9 bits for the other - // segments. - localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS); logic [HIGH_SEGMENT_BITS-1:0] Segment1, MixSegment1, Segment1Combined; logic [LOW_SEGMENT_BITS-1:0] Segment0, MixSegment0, Segment0Combined; @@ -58,28 +69,60 @@ module page_number_mixer #(parameter BITS = 20, // Reswizzle segments of the combined page number assign PageNumberCombined = {Segment1Combined, Segment0Combined}; end else begin - // The upper segment might have a different width than the lower segments. - // For example, an sv39 PTE has 26 bits for PPN2 and 9 bits for the other - // segments. - localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS) / 2; - logic [HIGH_SEGMENT_BITS-1:0] Segment2, MixSegment2, Segment2Combined; + // After segment 0 and 1 of the page number, the width of each segment is dependent on the SvMode. + // For this reason, each segment bus is the width of its widest value across each mode + // when a smaller value needs to be loaded in to a wider bus, it's loaded in the least significant bits + // and left padded with zeros. MAKE SURE that if a value is being padded with zeros here, + // that it's padded with zeros everywhere else in the MMU and beyond to avoid false misses in the TLB. 
+ logic [HIGH_SEGMENT_BITS-1:0] Segment3, MixSegment3, Segment3Combined; + logic [HIGH_SEGMENT_BITS + LOW_SEGMENT_BITS-1:0] Segment2, MixSegment2, Segment2Combined; logic [LOW_SEGMENT_BITS-1:0] Segment1, MixSegment1, Segment1Combined; logic [LOW_SEGMENT_BITS-1:0] Segment0, MixSegment0, Segment0Combined; + // Unswizzle segments of the input page number - assign {Segment2, Segment1, Segment0} = PageNumber; - assign {MixSegment2, MixSegment1, MixSegment0} = MixPageNumber; + // *** these muxes assume that only Sv48 and SV39 are implemented in rv64. for future SV57 and up, + // there will have to be more muxes to select which value each segment gets. + // as a cool reminder: BITS is the width of the page number, virt or phys, coming into this module + // while high segment bits is the width of the highest segment of that page number. + // Note for future work: this module has to work with both VPNs and PPNs and due to their differing + // widths and the fact that the ppn has one longer segment at the top makes the muxes below very confusing. + // Potentially very annoying thing for future workers: the number of bits in a ppn is always 44 (for SV39 and48) + // but in SV57 and above, this might be a new longer length. In that case these selectors will most likely + // become even more complicated and confusing. + assign Segment3 = (SvMode == `SV48) ? + PageNumber[BITS-1:3*LOW_SEGMENT_BITS] : // take the top segment or not + {HIGH_SEGMENT_BITS{1'b0}}; // for virtual page numbers in SV39, both options should be zeros. + assign Segment2 = (SvMode == `SV48) ? + {{HIGH_SEGMENT_BITS{1'b0}}, PageNumber[3*LOW_SEGMENT_BITS-1:2*LOW_SEGMENT_BITS]} : // just take another low segment left padded with zeros. + PageNumber[BITS-1:2*LOW_SEGMENT_BITS]; // otherwise take the rest of the PageNumber + assign Segment1 = PageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS]; + assign Segment0 = PageNumber[LOW_SEGMENT_BITS-1:0]; + + + assign MixSegment3 = (SvMode == `SV48) ? 
+ MixPageNumber[BITS-1:3*LOW_SEGMENT_BITS] : // take the top segment or not + {HIGH_SEGMENT_BITS{1'b0}}; // for virtual page numbers in SV39, both options should be zeros. + assign MixSegment2 = (SvMode == `SV48) ? + {{HIGH_SEGMENT_BITS{1'b0}}, MixPageNumber[3*LOW_SEGMENT_BITS-1:2*LOW_SEGMENT_BITS]} : // just take another low segment left padded with zeros. + MixPageNumber[BITS-1:2*LOW_SEGMENT_BITS]; // otherwise take the rest of the PageNumber + assign MixSegment1 = MixPageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS]; + assign MixSegment0 = MixPageNumber[LOW_SEGMENT_BITS-1:0]; + // Pass through the high segment - assign Segment2Combined = Segment2; + assign Segment3Combined = Segment3; - // Either pass through or zero out segments 1 and 0 based on the page type - mux2 #(LOW_SEGMENT_BITS) segment1mux(Segment1, MixSegment1, PageType[1], Segment1Combined); - mux2 #(LOW_SEGMENT_BITS) segment0mux(Segment0, MixSegment0, PageType[0], Segment0Combined); + // Either pass through or zero out lower segments based on the page type + assign Segment2Combined = (PageType[1] && PageType[0]) ? MixSegment2 : Segment2; // terapage (page == 11) + assign Segment1Combined = (PageType[1]) ? MixSegment1 : Segment1; // gigapage and higher (page == 10 or 11) + assign Segment0Combined = (PageType[1] || PageType[0]) ? MixSegment0 : Segment0; // megapage and higher (page == 01 or 10 or 11) // Reswizzle segments of the combined page number - assign PageNumberCombined = {Segment2Combined, Segment1Combined, Segment0Combined}; + assign PageNumberCombined = (SvMode == `SV48) ? 
+ {Segment3Combined, Segment2Combined[LOW_SEGMENT_BITS-1:0], Segment1Combined, Segment0Combined} : + {Segment2Combined, Segment1Combined, Segment0Combined}; end endgenerate endmodule diff --git a/wally-pipelined/src/mmu/pagetablewalker.sv b/wally-pipelined/src/mmu/pagetablewalker.sv index f2aada444..b0e4fe8e5 100644 --- a/wally-pipelined/src/mmu/pagetablewalker.sv +++ b/wally-pipelined/src/mmu/pagetablewalker.sv @@ -2,7 +2,10 @@ // pagetablewalker.sv // // Written: tfleming@hmc.edu 2 March 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// implemented SV48 on top of SV39. This included, adding a level of the FSM for the extra page number segment +// adding support for terapage encoding, and for setting the TranslationPAdr using the new level, +// adding the internal SvMode signal // // Purpose: Page Table Walker // Part of the Memory Management Unit (MMU) @@ -70,6 +73,7 @@ module pagetablewalker ( logic [`XLEN-1:0] SavedPTE, CurrentPTE; logic [`PA_BITS-1:0] TranslationPAdr; logic [`PPN_BITS-1:0] CurrentPPN; + logic [`SVMODE_BITS-1:0] SvMode; logic MemStore; // PTE Control Bits @@ -82,6 +86,8 @@ module pagetablewalker ( logic [`XLEN-1:0] PageTableEntry; logic [1:0] PageType; + assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS]; + assign BasePageTablePPN = SATP_REGW[`PPN_BITS-1:0]; assign MemStore = MemRWM[0]; @@ -105,11 +111,12 @@ module pagetablewalker ( assign PageTypeF = PageType; assign PageTypeM = PageType; - localparam IDLE = 3'h0; + localparam LEVEL0 = 3'h0; localparam LEVEL1 = 3'h1; - localparam LEVEL0 = 3'h2; - localparam LEAF = 3'h3; - localparam FAULT = 3'h4; + // space left for more levels + localparam LEAF = 3'h5; + localparam IDLE = 3'h6; + localparam FAULT = 3'h7; logic [2:0] WalkerState, NextWalkerState; @@ -208,18 +215,32 @@ module pagetablewalker ( assign MMUPAdr = TranslationPAdr[31:0]; end else begin - localparam LEVEL2 = 3'h5; + localparam LEVEL2 = 3'h2; + localparam LEVEL3 = 3'h3; - logic [8:0] VPN2, VPN1, VPN0; + logic 
[8:0] VPN3, VPN2, VPN1, VPN0; - logic GigapageMisaligned, BadGigapage; + logic TerapageMisaligned, GigapageMisaligned, BadTerapage, BadGigapage; flopenl #(3) mmureg(HCLK, ~HRESETn, 1'b1, NextWalkerState, IDLE, WalkerState); always_comb begin case (WalkerState) - IDLE: if (MMUTranslate) NextWalkerState = LEVEL2; + IDLE: if (MMUTranslate) NextWalkerState = LEVEL3; else NextWalkerState = IDLE; + LEVEL3: if (SvMode != `SV48) NextWalkerState = LEVEL2; + // 3rd level used if SV48 is enabled. + else begin + if (~MMUReady) NextWalkerState = LEVEL3; + // *** According to the architecture, we should + // fault upon finding a superpage that is misaligned or has 0 + // access bit. The following commented line of code is + // supposed to perform that check. However, it is untested. + else if (ValidPTE && LeafPTE && ~BadTerapage) NextWalkerState = LEAF; + // else if (ValidPTE && LeafPTE) NextWalkerState = LEAF; // *** Once the above line is properly tested, delete this line. + else if (ValidPTE && ~LeafPTE) NextWalkerState = LEVEL2; + else NextWalkerState = FAULT; + end LEVEL2: if (~MMUReady) NextWalkerState = LEVEL2; // *** According to the architecture, we should // fault upon finding a superpage that is misaligned or has 0 @@ -242,24 +263,29 @@ module pagetablewalker ( else if (ValidPTE && LeafPTE && ~AccessAlert) NextWalkerState = LEAF; else NextWalkerState = FAULT; - LEAF: if (MMUTranslate) NextWalkerState = LEVEL2; + LEAF: if (MMUTranslate) NextWalkerState = LEVEL3; else NextWalkerState = IDLE; - FAULT: if (MMUTranslate) NextWalkerState = LEVEL2; + FAULT: if (MMUTranslate) NextWalkerState = LEVEL3; else NextWalkerState = IDLE; // Default case should never happen, but is included for linter. default: NextWalkerState = IDLE; endcase end + // A terapage is a level 3 leaf page. This page must have zero PPN[2], + // zero PPN[1], and zero PPN[0] + assign TerapageMisaligned = |(CurrentPPN[26:0]); // A gigapage is a Level 2 leaf page. 
This page must have zero PPN[1] and // zero PPN[0] assign GigapageMisaligned = |(CurrentPPN[17:0]); // A megapage is a Level 1 leaf page. This page must have zero PPN[0]. assign MegapageMisaligned = |(CurrentPPN[8:0]); + assign BadTerapage = TerapageMisaligned || AccessAlert; // *** Implement better access/dirty scheme assign BadGigapage = GigapageMisaligned || AccessAlert; // *** Implement better access/dirty scheme assign BadMegapage = MegapageMisaligned || AccessAlert; // *** Implement better access/dirty scheme + assign VPN3 = TranslationVAdr[47:39]; assign VPN2 = TranslationVAdr[38:30]; assign VPN1 = TranslationVAdr[29:21]; assign VPN0 = TranslationVAdr[20:12]; @@ -282,8 +308,13 @@ module pagetablewalker ( IDLE: begin MMUStall = '0; end + LEVEL3: begin + TranslationPAdr = {BasePageTablePPN, VPN3, 3'b000}; + // *** this is a huge breaking point. if we're going through level3 every time, even when sv48 is off, + // what should translationPAdr be when level3 is just off? + end LEVEL2: begin - TranslationPAdr = {BasePageTablePPN, VPN2, 3'b000}; + TranslationPAdr = {(SvMode == `SV48) ? CurrentPPN : BasePageTablePPN, VPN2, 3'b000}; end LEVEL1: begin TranslationPAdr = {CurrentPPN, VPN1, 3'b000}; @@ -295,8 +326,9 @@ module pagetablewalker ( // Keep physical address alive to prevent HADDR dropping to 0 TranslationPAdr = {CurrentPPN, VPN0, 3'b000}; PageTableEntry = CurrentPTE; - PageType = (WalkerState == LEVEL2) ? 2'b11 : - ((WalkerState == LEVEL1) ? 2'b01 : 2'b00); + PageType = (WalkerState == LEVEL3) ? 2'b11 : + ((WalkerState == LEVEL2) ? 2'b10 : + ((WalkerState == LEVEL1) ? 
2'b01 : 2'b00)); DTLBWriteM = DTLBMissM; ITLBWriteF = ~DTLBMissM; // Prefer data over instructions end diff --git a/wally-pipelined/src/mmu/tlb.sv b/wally-pipelined/src/mmu/tlb.sv index 7ed594e45..1828c98e7 100644 --- a/wally-pipelined/src/mmu/tlb.sv +++ b/wally-pipelined/src/mmu/tlb.sv @@ -2,7 +2,9 @@ // tlb.sv // // Written: jtorrey@hmc.edu 16 February 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// Implemented SV48 on top of SV39. This included adding the SvMode signal, +// and using it to decide the translate signal and get the virtual page number // // Purpose: Translation lookaside buffer // Cache of virtural-to-physical address translations @@ -25,7 +27,7 @@ /////////////////////////////////////////// /** - * sv32 specs + * SV32 specs * ---------- * Virtual address [31:0] (32 bits) * [________________________________] @@ -85,14 +87,11 @@ module tlb #(parameter ENTRY_BITS = 3, output TLBPageFault ); - logic SvMode; logic Translate; logic TLBAccess, ReadAccess, WriteAccess; - // *** If we want to support multiple virtual memory modes (ie sv39 AND sv48), - // we could have some muxes that control which parameters are current. - // Although then some of the signals are not big enough. But that's a problem - // for much later. + // Store current virtual memory mode (SV32, SV39, SV48, ect...) + logic [`SVMODE_BITS-1:0] SvMode; // Index (currently random) to write the next TLB entry logic [ENTRY_BITS-1:0] WriteIndex; @@ -116,17 +115,24 @@ module tlb #(parameter ENTRY_BITS = 3, // Whether the virtual address has a match in the CAM logic CAMHit; - // Grab the sv bit from SATP + // Grab the sv mode from SATP + assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS]; + + // The bus width is always the largest it could be for that XLEN. For example, vpn will be 36 bits wide in rv64 + // this, even though it could be 27 bits (SV39) or 36 bits (SV48) wide. 
When the value of VPN is narrower, + // is shorter, the extra bits are used as padded zeros on the left of the full value. generate if (`XLEN == 32) begin - assign SvMode = SATP_REGW[31]; // *** change to an enum somehow? + assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12]; end else begin - assign SvMode = SATP_REGW[63]; // currently just a boolean whether translation enabled + assign VirtualPageNumber = (SvMode == `SV48) ? + VirtualAddress[`VPN_BITS+11:12] : + {{`VPN_SEGMENT_BITS{1'b0}}, VirtualAddress[3*`VPN_SEGMENT_BITS+11:12]}; end endgenerate // Whether translation should occur - assign Translate = SvMode & (PrivilegeModeW != `M_MODE); + assign Translate = (SvMode != `NO_TRANSLATE) & (PrivilegeModeW != `M_MODE); // Determine how the TLB is currently being used // Note that we use ReadAccess for both loads and instruction fetches @@ -134,7 +140,7 @@ module tlb #(parameter ENTRY_BITS = 3, assign WriteAccess = TLBAccessType[0]; assign TLBAccess = ReadAccess || WriteAccess; - assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12]; + assign PageOffset = VirtualAddress[11:0]; // TLB entries are evicted according to the LRU algorithm @@ -188,9 +194,10 @@ module tlb #(parameter ENTRY_BITS = 3, // page number. For 4 KB pages, the entire virtual page number is replaced. // For superpages, some segments are considered offsets into a larger page. 
page_number_mixer #(`PPN_BITS, `PPN_HIGH_SEGMENT_BITS) - physical_mixer(PhysicalPageNumber, + physical_mixer(PhysicalPageNumber, {{EXTRA_PHYSICAL_BITS{1'b0}}, VirtualPageNumber}, HitPageType, + SvMode, PhysicalPageNumberMixed); // Provide physical address only on TLBHits to cause catastrophic errors if diff --git a/wally-pipelined/src/mmu/tlb_cam.sv b/wally-pipelined/src/mmu/tlb_cam.sv index 330bb382c..78d9ff8d3 100644 --- a/wally-pipelined/src/mmu/tlb_cam.sv +++ b/wally-pipelined/src/mmu/tlb_cam.sv @@ -2,7 +2,9 @@ // tlb_cam.sv // // Written: jtorrey@hmc.edu 16 February 2021 -// Modified: +// Modified: kmacsaigoren@hmc.edu 1 June 2021 +// Implemented SV48 on top of SV39. This included adding the SvMode signal input and wally constants +// Mostly this was to make the cam_lines work. // // Purpose: Stores virtual page numbers with cached translations. // Determines whether a given virtual page number is in the TLB. @@ -24,18 +26,21 @@ // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /////////////////////////////////////////// +`include "wally-constants.vh" + module tlb_cam #(parameter ENTRY_BITS = 3, parameter KEY_BITS = 20, parameter HIGH_SEGMENT_BITS = 10) ( - input clk, reset, - input [KEY_BITS-1:0] VirtualPageNumber, - input [1:0] PageTypeWrite, - input [ENTRY_BITS-1:0] WriteIndex, - input TLBWrite, - input TLBFlush, - output [ENTRY_BITS-1:0] VPNIndex, - output [1:0] HitPageType, - output CAMHit + input clk, reset, + input [KEY_BITS-1:0] VirtualPageNumber, + input [1:0] PageTypeWrite, + input [ENTRY_BITS-1:0] WriteIndex, + input [`SVMODE_BITS-1:0] SvMode, + input TLBWrite, + input TLBFlush, + output [ENTRY_BITS-1:0] VPNIndex, + output [1:0] HitPageType, + output CAMHit ); localparam NENTRIES = 2**ENTRY_BITS;