// Source listing: MegaCD_MiSTer/rtl/cache_2way.sv
// Revision date:  2020-06-19 20:09:09 +08:00
// Listing size:   428 lines, 12 KiB
// Language:       SystemVerilog

//
// 2015, rok.krajnc@gmail.com
// 2020, Alexey Melnikov
//
// this is a 2-way set-associative cache
// write-through, look-through
// IDX_MSB = 9: 16kB cache size, 8kB per way
// (module default is IDX_MSB = 8: 8kB cache size, 4kB per way)
// Optimized for 64bit DRAM
//
//
module cache_2way #(parameter IDX_MSB = 8)
(
    // system
    input                   clk,            // clock
    input                   rst,            // cache reset
    input                   cache_enable,   // latched into cc_en while cpu_cs is low
    input                   cache_clear,    // latched into cc_clr while cpu_cs is low
    input                   cache_inhibit,  // cache inhibit update
    // cpu
    input                   cpu_cs,         // cpu activity
    input   [IDX_MSB+21:1]  cpu_adr,        // cpu address
    input            [1:0]  cpu_bs,         // cpu byte selects
    input                   cpu_we,         // cpu write
    input                   cpu_rd,         // cpu data read (not referenced inside this module)
    input           [15:0]  cpu_dat_w,      // cpu write data
    output reg      [15:0]  cpu_dat_r,      // cpu read data
    output reg              cpu_ack,        // cpu acknowledge
    // writebuffer
    output reg              wb_en,          // writebuffer enable
    // sdram
    input           [63:0]  mem_dat_r,      // sdram read data
    output reg              mem_read_req,   // sdram read request from cache
    input                   mem_read_ack    // sdram read acknowledge to cache
);

    //// internal signals ////

    // cache_enable / cache_clear are only sampled while the CPU is not
    // selecting the cache, so the latched copies stay stable for the whole
    // duration of a CPU access.
    reg cc_en;
    reg cc_clr;

    always @ (posedge clk) begin
        if (rst) begin
            cc_en  <= 1'b0;
            cc_clr <= 1'b0;
        end else if (!cpu_cs) begin
            cc_en  <= cache_enable;
            cc_clr <= cache_clear;
        end
    end

    // slice up cpu address
    wire       [1:0] cpu_adr_blk = cpu_adr[2:1];                  // 16-bit word inside the 64-bit cache row
    wire [IDX_MSB:0] cpu_adr_idx = cpu_adr[IDX_MSB+3:3];          // cache row address, IDX_MSB+1 bits
    wire      [17:0] cpu_adr_tag = cpu_adr[IDX_MSB+21:IDX_MSB+4]; // tag, 18 bits

    // Signals that are produced further down in the module. They are declared
    // here, ahead of their first use, so that tools enforcing
    // declaration-before-use accept the file.
    reg          cache_init_done;   // driven by the sdram side state machine
    wire [39:0]  dtram_cpu_dat_r;   // tag ram read data (port A)
    wire [15:0]  ddram0_cpu_dat_r;  // way 0 data ram read data (port A)
    wire [15:0]  ddram1_cpu_dat_r;  // way 1 data ram read data (port A)

    // Tag word layout as used by this module:
    //   [39]    lru    - 1: way 0 is replaced on the next fill, 0: way 1
    //   [38]    way 0 valid
    //   [37]    way 1 valid
    //   [36]    written as 0 on fills, otherwise carried over unchanged
    //   [35:18] way 1 tag
    //   [17:0]  way 0 tag
    wire dtag0_match = (cpu_adr_tag == dtram_cpu_dat_r[17:0]);
    wire dtag1_match = (cpu_adr_tag == dtram_cpu_dat_r[35:18]);
    wire dtag_lru    = dtram_cpu_dat_r[39];
    wire dtag0_valid = dtram_cpu_dat_r[38];
    wire dtag1_valid = dtram_cpu_dat_r[37];

    //// cpu side ////

    localparam [3:0]
        CPU_SM_INIT  = 4'd0,
        CPU_SM_IDLE  = 4'd1,
        CPU_SM_WRITE = 4'd2,
        CPU_SM_WB    = 4'd3,
        CPU_SM_READ  = 4'd4,
        CPU_SM_WAIT  = 4'd5,
        CPU_SM_FILL  = 4'd6,
        CPU_SM_FILLW = 4'd7;

    reg       [3:0] cpu_sm_state;
    reg             cpu_sm_dtag_we;
    reg             cpu_sm_dram0_we;
    reg             cpu_sm_dram1_we;
    reg       [1:0] cpu_sm_bs;
    reg      [15:0] cpu_sm_mem_dat_w;
    reg      [39:0] cpu_sm_tag_dat_w;
    reg [IDX_MSB:0] upd_sm_adr;
    reg             upd_sm_dram0_we;
    reg             upd_sm_dram1_we;
    reg      [63:0] upd_data;

    // cpu side state machine
    always @ (posedge clk) begin
        if (rst) begin
            mem_read_req    <= 1'b0;
            wb_en           <= 1'b0;
            cpu_ack         <= 1'b0;
            cpu_sm_state    <= CPU_SM_INIT;
            cpu_sm_dtag_we  <= 1'b0;
            cpu_sm_dram0_we <= 1'b0;
            cpu_sm_dram1_we <= 1'b0;
            cpu_sm_bs       <= 2'b11;
            upd_sm_dram0_we <= 1'b0;
            upd_sm_dram1_we <= 1'b0;
        end else begin
            // default values: every strobe is a single-cycle pulse unless a
            // state re-asserts it below
            mem_read_req    <= 1'b0;
            wb_en           <= 1'b0;
            cpu_sm_dtag_we  <= 1'b0;
            cpu_sm_dram0_we <= 1'b0;
            cpu_sm_dram1_we <= 1'b0;
            cpu_sm_bs       <= 2'b11;
            upd_sm_dram0_we <= 1'b0;
            upd_sm_dram1_we <= 1'b0;

            // state machine
            case (cpu_sm_state)
                CPU_SM_INIT : begin
                    // waiting for cache init
                    if (cache_init_done) begin
                        cpu_sm_state <= CPU_SM_IDLE;
                    end else begin
                        cpu_sm_state <= CPU_SM_INIT;
                    end
                end

                CPU_SM_IDLE : begin
                    // waiting for CPU access
                    if (cpu_cs) begin
                        if (cpu_we) begin
                            cpu_sm_state <= CPU_SM_WRITE;
                        end else begin
                            cpu_sm_state <= CPU_SM_READ;
                        end
                    end else begin
                        // a pending clear request sends the machine back to
                        // INIT while the sdram side rewrites the tag ram
                        if (cc_clr)
                            cpu_sm_state <= CPU_SM_INIT;
                        else
                            cpu_sm_state <= CPU_SM_IDLE;
                    end
                end

                CPU_SM_WRITE : begin
                    // on hit update cache, on miss no update necessary;
                    // tags don't get updated on writes.
                    // cpu_ack is not raised by this module for writes.
                    cpu_sm_bs        <= cpu_bs;
                    cpu_sm_mem_dat_w <= cpu_dat_w;
                    cpu_sm_dram0_we  <= dtag0_match && dtag0_valid;
                    cpu_sm_dram1_we  <= dtag1_match && dtag1_valid;
                    cpu_sm_state     <= CPU_SM_WB;
                    wb_en            <= 1'b1;
                    // last assignment wins: an access that has already been
                    // dropped returns straight to IDLE
                    if (!cpu_cs) cpu_sm_state <= CPU_SM_IDLE;
                end

                CPU_SM_WB : begin
                    // keep the writebuffer enabled until the CPU releases the access
                    if (!cpu_cs) cpu_sm_state <= CPU_SM_IDLE;
                    else         wb_en        <= 1'b1;
                end

                CPU_SM_READ : begin
                    if (cc_en && dtag0_match && dtag0_valid) begin
                        // data is already in data cache way 0;
                        // lru bit cleared so that way 1 becomes the next victim
                        cpu_dat_r        <= ddram0_cpu_dat_r;
                        cpu_ack          <= 1'b1;
                        cpu_sm_dtag_we   <= 1'b1;
                        cpu_sm_tag_dat_w <= {1'b0, dtram_cpu_dat_r[38:0]};
                        cpu_sm_state     <= CPU_SM_WAIT;
                    end
                    else if (cc_en && dtag1_match && dtag1_valid) begin
                        // data is already in data cache way 1;
                        // lru bit set so that way 0 becomes the next victim
                        cpu_dat_r        <= ddram1_cpu_dat_r;
                        cpu_ack          <= 1'b1;
                        cpu_sm_dtag_we   <= 1'b1;
                        cpu_sm_tag_dat_w <= {1'b1, dtram_cpu_dat_r[38:0]};
                        cpu_sm_state     <= CPU_SM_WAIT;
                    end
                    else begin
                        // on miss (or with the cache disabled) fetch data from SDRAM
                        mem_read_req <= 1'b1;
                        cpu_sm_state <= CPU_SM_FILL;
                    end
                end

                CPU_SM_WAIT : begin
                    // hit served, wait for the CPU to release the access
                    if (!cpu_cs) cpu_sm_state <= CPU_SM_IDLE;
                end

                CPU_SM_FILL : begin
                    upd_sm_adr <= cpu_adr_idx;
                    if (mem_read_ack) begin
                        // read data to cpu: pick the addressed 16-bit word
                        // out of the 64-bit sdram line
                        cpu_dat_r <= mem_dat_r[{cpu_adr_blk, 4'b0000} +:16];
                        cpu_ack   <= 1'b1;
                        if (cache_inhibit) begin
                            // don't update cache if caching is inhibited
                            cpu_sm_state <= CPU_SM_FILLW;
                        end else begin
                            // update tag ram: the victim way gets the new tag
                            // and its valid bit set, the lru bit is flipped
                            // to point at the other way
                            if (dtag_lru) begin
                                cpu_sm_tag_dat_w <= {1'b0, 1'b1, dtram_cpu_dat_r[37], 1'b0, dtram_cpu_dat_r[35:18], cpu_adr_tag};
                            end else begin
                                cpu_sm_tag_dat_w <= {1'b1, dtram_cpu_dat_r[38], 1'b1, 1'b0, cpu_adr_tag, dtram_cpu_dat_r[17: 0]};
                            end
                            cpu_sm_dtag_we  <= 1'b1;
                            // write the complete 64-bit line into the victim
                            // way through the wide port of its data ram
                            upd_data        <= mem_dat_r;
                            upd_sm_dram0_we <= dtag_lru;
                            upd_sm_dram1_we <= !dtag_lru;
                            cpu_sm_state    <= CPU_SM_FILLW;
                        end
                    end
                end

                CPU_SM_FILLW : begin
                    // cpu_ack is cleared below once cpu_cs drops
                    if (!cpu_ack) cpu_sm_state <= CPU_SM_IDLE;
                end

                default : begin
                    // encodings 8..15 are never entered on purpose; recover
                    // through INIT instead of staying stuck
                    cpu_sm_state <= CPU_SM_INIT;
                end
            endcase

            // when CPU lowers its request signal, lower ack too
            // (placed after the case so it overrides any ack set above)
            if (!cpu_cs) cpu_ack <= 1'b0;
        end
    end

    //// sdram side ////

    localparam [3:0]
        SDR_SM_INIT0 = 4'd0,
        SDR_SM_INIT1 = 4'd1,
        SDR_SM_IDLE  = 4'd2,
        SDR_SM_WAIT  = 4'd3;    // defined but never entered

    reg       [3:0] sdr_sm_state;
    // Sized from IDX_MSB so that the clear sweep walks every tag ram row for
    // any parameter value (identical to the former fixed [8:0] at IDX_MSB = 8).
    reg [IDX_MSB:0] sdr_sm_adr;
    reg             sdr_sm_dtag_we;

    // sdram side state machine: walks all tag ram rows through port B after
    // reset and whenever a clear is requested
    always @ (posedge clk) begin
        if (rst) begin
            cache_init_done <= 1'b0;
            sdr_sm_state    <= SDR_SM_INIT0;
            sdr_sm_dtag_we  <= 1'b0;
        end else begin
            // default values
            cache_init_done <= 1'b1;
            sdr_sm_dtag_we  <= 1'b0;

            // state machine
            case (sdr_sm_state)
                SDR_SM_INIT0 : begin
                    // prepare to clear cache
                    cache_init_done <= 1'b0;
                    sdr_sm_adr      <= 0;
                    sdr_sm_dtag_we  <= 1'b1;
                    sdr_sm_state    <= SDR_SM_INIT1;
                end

                SDR_SM_INIT1 : begin
                    // clear cache, one row per clock until the address is all ones
                    cache_init_done <= 1'b0;
                    sdr_sm_adr      <= sdr_sm_adr + 1'd1;
                    sdr_sm_dtag_we  <= 1'b1;
                    if (&sdr_sm_adr) begin
                        sdr_sm_state <= SDR_SM_IDLE;
                    end else begin
                        sdr_sm_state <= SDR_SM_INIT1;
                    end
                end

                SDR_SM_IDLE : begin
                    if (cc_clr) sdr_sm_state <= SDR_SM_INIT0;
                end

                default : begin
                    // SDR_SM_WAIT and the unused encodings fall back to IDLE
                    sdr_sm_state <= SDR_SM_IDLE;
                end
            endcase
        end
    end

    //// data data memories ////

    // data tag ram
    // Port A: cpu side lookup/update. Port B: clear sweep.
    // NOTE(review): data_b is left unconnected, so the value written during
    // the sweep is whatever dpram defines for an unconnected data_b -
    // presumably all zeroes (both ways invalid); confirm against dpram.
    dpram #(IDX_MSB+1,40) dtram (
        .clock      (clk              ),
        .address_a  (cpu_adr_idx      ),
        .wren_a     (cpu_sm_dtag_we   ),
        .data_a     (cpu_sm_tag_dat_w ),
        .q_a        (dtram_cpu_dat_r  ),
        .address_b  (sdr_sm_adr       ),
        .wren_b     (sdr_sm_dtag_we   )
    );

    // data data ram 0
    // Port A: 16-bit cpu access with byte enables. Port B: 64-bit line fill.
    cache_be #(IDX_MSB) ddram0 (
        .clock      (clk                       ),
        .address_a  ({cpu_adr_idx, cpu_adr_blk}),
        .byteena_a  (cpu_sm_bs                 ),
        .wren_a     (cpu_sm_dram0_we           ),
        .data_a     (cpu_sm_mem_dat_w          ),
        .q_a        (ddram0_cpu_dat_r          ),
        .address_b  (upd_sm_adr                ),
        .wren_b     (upd_sm_dram0_we           ),
        .data_b     (upd_data                  )
    );

    // data data ram 1
    cache_be #(IDX_MSB) ddram1 (
        .clock      (clk                       ),
        .address_a  ({cpu_adr_idx, cpu_adr_blk}),
        .byteena_a  (cpu_sm_bs                 ),
        .wren_a     (cpu_sm_dram1_we           ),
        .data_a     (cpu_sm_mem_dat_w          ),
        .q_a        (ddram1_cpu_dat_r          ),
        .address_b  (upd_sm_adr                ),
        .wren_b     (upd_sm_dram1_we           ),
        .data_b     (upd_data                  )
    );

endmodule
module cache_be #(parameter IDX_MSB)
(
    input                 clock,
    input  [IDX_MSB+2:0]  address_a,
    input          [1:0]  byteena_a,
    input         [15:0]  data_a,
    input                 wren_a,
    output        [15:0]  q_a,
    input    [IDX_MSB:0]  address_b,
    input         [63:0]  data_b,
    input                 wren_b,
    output        [63:0]  q_b
);

    // Mixed-width true dual-port block ram built on altsyncram:
    //   port A - 16 bit wide, two byte enables, IDX_MSB+3 address bits
    //   port B - 64 bit wide, IDX_MSB+1 address bits
    // Both ports describe the same number of memory bits and are clocked
    // from the single `clock` input; read data is not output-registered.
    localparam ADDR_BITS_A = IDX_MSB + 3;
    localparam ADDR_BITS_B = IDX_MSB + 1;

    altsyncram #(
        // device / primitive selection
        .intended_device_family             ("Cyclone V"),
        .lpm_type                           ("altsyncram"),
        .operation_mode                     ("BIDIR_DUAL_PORT"),
        .power_up_uninitialized             ("FALSE"),
        // port A geometry
        .width_a                            (16),
        .widthad_a                          (ADDR_BITS_A),
        .numwords_a                         (2**ADDR_BITS_A),
        .width_byteena_a                    (2),
        // port B geometry
        .width_b                            (64),
        .widthad_b                          (ADDR_BITS_B),
        .numwords_b                         (2**ADDR_BITS_B),
        .width_byteena_b                    (1),
        .byte_size                          (8),
        // port B input registers run from clock0 as well
        .address_reg_b                      ("CLOCK0"),
        .indata_reg_b                       ("CLOCK0"),
        .wrcontrol_wraddress_reg_b          ("CLOCK0"),
        // clock enables are not used
        .clock_enable_input_a               ("BYPASS"),
        .clock_enable_input_b               ("BYPASS"),
        .clock_enable_output_a              ("BYPASS"),
        .clock_enable_output_b              ("BYPASS"),
        // outputs: no register stage, no async clear
        .outdata_reg_a                      ("UNREGISTERED"),
        .outdata_reg_b                      ("UNREGISTERED"),
        .outdata_aclr_a                     ("NONE"),
        .outdata_aclr_b                     ("NONE"),
        // read-during-write behaviour
        .read_during_write_mode_mixed_ports ("DONT_CARE"),
        .read_during_write_mode_port_a      ("NEW_DATA_NO_NBE_READ"),
        .read_during_write_mode_port_b      ("NEW_DATA_NO_NBE_READ")
    )
    altsyncram_component
    (
        // clocking
        .clock0         (clock),
        .clock1         (1'b1),
        .clocken0       (1'b1),
        .clocken1       (1'b1),
        .clocken2       (1'b1),
        .clocken3       (1'b1),
        .aclr0          (1'b0),
        .aclr1          (1'b0),
        // port A
        .address_a      (address_a),
        .addressstall_a (1'b0),
        .byteena_a      (byteena_a),
        .data_a         (data_a),
        .wren_a         (wren_a),
        .rden_a         (1'b1),
        .q_a            (q_a),
        // port B
        .address_b      (address_b),
        .addressstall_b (1'b0),
        .byteena_b      (1'b1),
        .data_b         (data_b),
        .wren_b         (wren_b),
        .rden_b         (1'b1),
        .q_b            (q_b),
        // unused status output
        .eccstatus      ()
    );

endmodule
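/*
// Disabled alternative implementation of cache_be, kept for reference only:
// eight byte-wide spram blocks with fixed 12-bit / 10-bit address ports.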
module cache_be
(
input clock,
input [11:0] address_a,
input [1:0] byteena_a,
input [15:0] data_a,
input wren_a,
output [15:0] q_a,
input [9:0] address_b,
input [63:0] data_b,
input wren_b
);
generate
genvar i;
for(i=0; i<8; i++) begin: ramblock
wire [7:0] dout;
if(i[0])
assign q_a[15:8] = (address_a[1:0] == i[2:1]) ? dout : 8'bZ;
else
assign q_a[7:0] = (address_a[1:0] == i[2:1]) ? dout : 8'bZ;
spram #(10,8," ",{"MEM",{4'h3,i[3:0]}}) ram
(
.clock(clock),
.address(wren_b ? address_b : address_a[11:2]),
.data(wren_b ? data_b[(i<<3) +:8] : i[0] ? data_a[15:8] : data_a[7:0]),
.wren(wren_b | (wren_a & byteena_a[i[0]])),
.cs(wren_b | address_a[1:0] == i[2:1]),
.q(dout)
);
end
endgenerate
endmodule
*/