diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index b07e64056..d8083bb6c 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -40,18 +40,26 @@ module icache( output logic [31:0] InstrRawD ); - logic DelayF, DelaySideF, FlushDLastCycle; - logic [1:0] InstrDMuxChoice; - logic [15:0] MisalignedHalfInstrF, MisalignedHalfInstrD; - logic [31:0] InstrF, AlignedInstrD; - logic [31:0] nop = 32'h00000013; // instruction for NOP + logic DelayF, DelaySideF, FlushDLastCycle; + logic [1:0] InstrDMuxChoice; + logic [15:0] MisalignedHalfInstrF, MisalignedHalfInstrD; + logic [31:0] InstrF, AlignedInstrD; + logic [31:0] nop = 32'h00000013; // instruction for NOP + logic LastReadDataValidF; + logic [`XLEN-1:0] LastReadDataF, LastReadAdrF, InDataF; flopr #(1) flushDLastCycleFlop(clk, reset, FlushD | (FlushDLastCycle & StallF), FlushDLastCycle); flopenr #(1) delayStateFlop(clk, reset, ~StallF, (DelayF & ~DelaySideF) ? 1'b1 : 1'b0 , DelaySideF); flopenr #(16) halfInstrFlop(clk, reset, DelayF & ~StallF, MisalignedHalfInstrF, MisalignedHalfInstrD); + // This flop is here to simulate pulling data out of the cache, which is edge-triggered flopenr #(32) instrFlop(clk, reset, ~StallF, InstrF, AlignedInstrD); + // These flops cache the previous read, to accelerate things + flopenr #(`XLEN) lastReadDataFlop(clk, reset, InstrReadF & ~StallF, InstrInF, LastReadDataF); + flopenr #(1) lastReadDataVFlop(clk, reset, InstrReadF & ~StallF, 1'b1, LastReadDataValidF); + flopenr #(`XLEN) lastReadAdrFlop(clk, reset, InstrReadF & ~StallF, InstrPAdrF, LastReadAdrF); + // Decide which address needs to be fetched and sent out over InstrPAdrF // If the requested address fits inside one read from memory, we fetch that // address, adjusted to the bit width. Otherwise, we request the lower word @@ -64,34 +72,47 @@ module icache( end endgenerate // For now, we always read since the cache doesn't actually cache - assign InstrReadF = 1; + + always_comb if (LastReadDataValidF & (InstrPAdrF == LastReadAdrF)) begin + assign InstrReadF = 0; + end else begin + assign InstrReadF = 1; + end + + // Pick from the memory input or from the previous read, as appropriate + mux2 #(`XLEN) inDataMux(LastReadDataF, InstrInF, InstrReadF, InDataF); // If the instruction fits in one memory read, then we put the right bits // into InstrF. Otherwise, we activate DelayF to signal the rest of the // machinery to swizzle bits. generate if (`XLEN == 32) begin - assign InstrF = PCPF[1] ? {16'b0, InstrInF[31:16]} : InstrInF; + assign InstrF = PCPF[1] ? {16'b0, InDataF[31:16]} : InDataF; assign DelayF = PCPF[1]; - assign MisalignedHalfInstrF = InstrInF[31:16]; + assign MisalignedHalfInstrF = InDataF[31:16]; end else begin - assign InstrF = PCPF[2] ? (PCPF[1] ? {16'b0, InstrInF[63:48]} : InstrInF[63:32]) : (PCPF[1] ? InstrInF[47:16] : InstrInF[31:0]); + assign InstrF = PCPF[2] ? (PCPF[1] ? {16'b0, InDataF[63:48]} : InDataF[63:32]) : (PCPF[1] ? InDataF[47:16] : InDataF[31:0]); assign DelayF = PCPF[1] && PCPF[2]; - assign MisalignedHalfInstrF = InstrInF[63:48]; + assign MisalignedHalfInstrF = InDataF[63:48]; end endgenerate - assign ICacheStallF = DelayF & ~DelaySideF; + assign ICacheStallF = 0; //DelayF & ~DelaySideF; // Detect if the instruction is compressed - // TODO Low-hanging optimization, don't delay if compressed - assign CompressedF = DelaySideF ? (MisalignedHalfInstrD[1:0] != 2'b11) : (InstrF[1:0] != 2'b11); + // TODO Low-hanging optimization, don't delay if getting a compressed instruction + assign CompressedF = (DelaySideF & DelayF) ? (MisalignedHalfInstrD[1:0] != 2'b11) : (InstrF[1:0] != 2'b11); // Pick the correct output, depending on whether we have to assemble this // instruction from two reads or not. // Output the requested instruction (we don't need to worry if the read is // incomplete, since the pipeline stalls for us when it isn't), or a NOP for // the cycle when the first of two reads comes in. - always_comb - assign InstrDMuxChoice = FlushDLastCycle ? 2'b10 : (DelayF ? (DelaySideF ? 2'b01 : 2'b10) : 2'b00); - mux3 #(32) instrDMux (AlignedInstrD, {InstrInF[15:0], MisalignedHalfInstrD}, nop, InstrDMuxChoice, InstrRawD); + always_comb if (DelayF & (MisalignedHalfInstrF[1:0] != 2'b11)) begin + assign InstrDMuxChoice = 2'b11; + end else if (FlushDLastCycle) begin + assign InstrDMuxChoice = 2'b10; + end else begin + assign InstrDMuxChoice = {1'b0, DelaySideF}; + end + mux4 #(32) instrDMux (AlignedInstrD, {InstrInF[15:0], MisalignedHalfInstrD}, nop, {16'b0, MisalignedHalfInstrD}, InstrDMuxChoice, InstrRawD); endmodule diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index 8947482ea..334e75e24 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -53,14 +53,13 @@ module testbench(); // "rv64m/I-REMW-01", "3000" }; string tests64ic[] = '{ - + "rv64ic/I-C-BEQZ-01", "3000", "rv64ic/I-C-ADD-01", "3000", "rv64ic/I-C-ADDI-01", "3000", "rv64ic/I-C-ADDIW-01", "3000", "rv64ic/I-C-ADDW-01", "3000", "rv64ic/I-C-AND-01", "3000", "rv64ic/I-C-ANDI-01", "3000", - "rv64ic/I-C-BEQZ-01", "3000", "rv64ic/I-C-BNEZ-01", "3000", "rv64ic/I-C-EBREAK-01", "2000", "rv64ic/I-C-J-01", "3000",