diff --git a/dv/verilator/sonata_system.cc b/dv/verilator/sonata_system.cc index 09d9f4a8e..95d24d43c 100644 --- a/dv/verilator/sonata_system.cc +++ b/dv/verilator/sonata_system.cc @@ -16,9 +16,13 @@ SonataSystem::SonataSystem(const char *ram_hier_path, int ram_size_words, const char *hyperram_hier_path, int hyperram_size_words) : _ram(ram_hier_path, ram_size_words, 4), -#ifdef USE_HYPERRAM_SIM_MODEL +#ifdef USE_HYPERRAM_SRAM_MODEL + // The SRAM model within the `hyperram` IP block is 32 bits wide to + // match the TL-UL bus. _hyperram(hyperram_hier_path, hyperram_size_words, 4) {} #else + // The simulation model of the W956 HyperRAM chip employs a memory + // that is 16 bits wide, as per the HyperBus protocol. _hyperram(hyperram_hier_path, hyperram_size_words / 2, 2) {} #endif diff --git a/dv/verilator/sonata_system_main.cc b/dv/verilator/sonata_system_main.cc index c2da48313..5c5b2f82a 100644 --- a/dv/verilator/sonata_system_main.cc +++ b/dv/verilator/sonata_system_main.cc @@ -8,9 +8,9 @@ int main(int argc, char **argv) { SonataSystem sonata_system( "TOP.top_verilator.u_sonata_system.u_sram_top.u_ram.gen_generic.u_impl_generic", 32 * 1024, // 32k words = 128 KiB -#ifdef USE_HYPERRAM_SIM_MODEL +#ifdef USE_HYPERRAM_SRAM_MODEL // Simple SRAM model used within the Sonata System for faster simulations. - "TOP.top_verilator.u_sonata_system.u_hyperram.u_hyperram_model.u_ram.gen_generic.u_impl_generic", + "TOP.top_verilator.u_sonata_system.u_hyperram.gen_dual_port.u_hyperram_model.u_ram.gen_generic.u_impl_generic", #else // HyperRAM simulation model external to the Sonata System; driven by HBMC. 
"TOP.top_verilator.u_hyperram_W956.u_ram.gen_generic.u_impl_generic", diff --git a/dv/verilator/sonata_verilator_lint.vlt b/dv/verilator/sonata_verilator_lint.vlt index 2500827ea..e694ec1a9 100644 --- a/dv/verilator/sonata_verilator_lint.vlt +++ b/dv/verilator/sonata_verilator_lint.vlt @@ -106,9 +106,11 @@ lint_off -rule WIDTHTRUNC -file "*hbmc_iobuf.v" lint_off -rule UNUSED -file "*hbmc_clk_obuf.v" lint_off -rule UNUSED -file "*hbmc_iobuf.v" -lint_off -rule UNOPTFLAT -file "*hbmc_tl_top.sv" +lint_off -rule UNOPTFLAT -file "*hbmc_tl_port.sv" lint_off -rule UNUSED -file "*hbmc_tl_top.sv" +lint_off -rule MULTIDRIVEN -file "*prim_arbiter_fixed.sv" + // Disable warnings in models of FPGA primitives. lint_off -rule UNUSED -file "*IOBUF.v" lint_off -rule UNUSED -file "*ISERDESE2.v" diff --git a/dv/verilator/top_verilator.sv b/dv/verilator/top_verilator.sv index 6c0553e97..3450fa258 100644 --- a/dv/verilator/top_verilator.sv +++ b/dv/verilator/top_verilator.sv @@ -177,7 +177,7 @@ module top_verilator #( wire unused_io_ = ^{mb1, ah_tmpio10, rph_g18, rph_g17, rph_g16_ce2, rph_g8_ce0, rph_g7_ce1, usrLed}; - +`ifndef TARGET_XL_BOARD // HyperRAM interface. wire [7:0] hyperram_dq; wire hyperram_rwds; @@ -185,7 +185,7 @@ module top_verilator #( wire hyperram_ckn; wire hyperram_nrst; wire hyperram_cs; - +`endif // Reporting of CHERI enable/disable and any exceptions that occur. 
wire [CheriErrWidth-1:0] cheri_err; logic [CheriErrWidth-1:0] cheri_errored; @@ -356,11 +356,16 @@ module top_verilator #( .clk_usb_i (clk_usb), .rst_usb_ni (rst_usb_n), + // HyperRAM clocks and reset +`ifdef TARGET_XL_BOARD + // No HyperRAM on Sonata XL +`else // Hyperram clocks .clk_hr_i (clk_hr), .clk_hr90p_i (clk_hr90p), .clk_hr3x_i (clk_hr3x), .rst_hr_ni (rst_hr_n), +`endif .gp_i ({ 15'b0, @@ -430,12 +435,16 @@ module top_verilator #( .rgbled_dout_o (), +`ifdef TARGET_XL_BOARD + // No HyperRAM on Sonata XL +`else .hyperram_dq (hyperram_dq), .hyperram_rwds (hyperram_rwds), .hyperram_ckp (hyperram_ckp), .hyperram_ckn (hyperram_ckn), .hyperram_nrst (hyperram_nrst), .hyperram_cs (hyperram_cs), +`endif .rs485_tx_enable_o(rs485_tx_enable), .rs485_rx_enable_o(rs485_rx_enable), @@ -663,6 +672,11 @@ module top_verilator #( .rx_i (rs485_uartdpi_rx) ); +`ifdef TARGET_XL_BOARD + // No HyperRAM on Sonata XL + logic unused_hr; + assign unused_hr = ^{clk_hr, clk_hr90p, clk_hr3x, rst_hr_n}; +`else // HyperRAM model (based on W956D8MBYA5I). hyperram_W956 u_hyperram_W956 ( // Asynchronous reset signal. @@ -677,6 +691,7 @@ module top_verilator #( // Bidirectional data bus. .dq (hyperram_dq) ); +`endif export "DPI-C" function mhpmcounter_get; diff --git a/rtl/ip/hyperram/hyperram.core b/rtl/ip/hyperram/hyperram.core index 131cb7c19..422d6bde6 100644 --- a/rtl/ip/hyperram/hyperram.core +++ b/rtl/ip/hyperram/hyperram.core @@ -11,6 +11,9 @@ filesets: - open_hbmc:hyperram:controller files: - rtl/hyperram.sv + - rtl/hyperram_rdbuf.sv + - rtl/hyperram_wrbuf.sv + - rtl/hbmc_tl_port.sv - rtl/hbmc_tl_top.sv file_type: systemVerilogSource diff --git a/rtl/ip/hyperram/rtl/hbmc_dfifo.sv b/rtl/ip/hyperram/rtl/hbmc_dfifo.sv index 543bd1d0c..535d1e87a 100644 --- a/rtl/ip/hyperram/rtl/hbmc_dfifo.sv +++ b/rtl/ip/hyperram/rtl/hbmc_dfifo.sv @@ -2,16 +2,16 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 -// Reimplementation of hbmc_dfifo using OpenTitan primitives, only works for DATA_WIDTH == 32 -module hbmc_dfifo # -( - parameter integer DATA_WIDTH = 32 -) -( +// Reimplementation of hbmc_dfifo using OpenTitan primitives, only works for DataWidth == 32 + +module hbmc_dfifo #( + parameter int unsigned DataWidth = 32, // Width of data words, bits. + parameter int unsigned FIFODepth = 8 // Depth of FIFO, entries. +) ( input wire fifo_wr_clk, input wire fifo_wr_nrst, - input wire [DATA_WIDTH - 1:0] fifo_wr_din, - input wire [DATA_WIDTH/8 - 1:0] fifo_wr_strb, + input wire [DataWidth - 1:0] fifo_wr_din, + input wire [DataWidth/8 - 1:0] fifo_wr_strb, input wire fifo_wr_ena, output wire fifo_wr_full, @@ -22,8 +22,8 @@ module hbmc_dfifo # input wire fifo_rd_ena, output wire fifo_rd_empty ); - // FIFO contains 32-bit data word and 4-bit strobes - localparam int unsigned FIFOWidth = DATA_WIDTH + 4; + // FIFO contains 32-bit data word and 4 bit strobes + localparam int unsigned FIFOWidth = DataWidth + (DataWidth / 8); logic [FIFOWidth-1:0] fifo_wdata, fifo_rdata; logic fifo_wready, fifo_rvalid, fifo_rready; @@ -35,7 +35,7 @@ module hbmc_dfifo # prim_fifo_async #( .Width(FIFOWidth), - .Depth(4) + .Depth(FIFODepth) ) u_fifo ( .clk_wr_i(fifo_wr_clk), .rst_wr_ni(fifo_wr_nrst), @@ -65,8 +65,8 @@ module hbmc_dfifo # end initial begin - if (DATA_WIDTH != 32) begin - $fatal("hbmc_dfifo only supports DATA_WIDTH of 32"); + if (DataWidth != 32) begin + $fatal("hbmc_dfifo only supports DataWidth of 32"); end end endmodule diff --git a/rtl/ip/hyperram/rtl/hbmc_tl_port.sv b/rtl/ip/hyperram/rtl/hbmc_tl_port.sv new file mode 100644 index 000000000..0a8759c25 --- /dev/null +++ b/rtl/ip/hyperram/rtl/hbmc_tl_port.sv @@ -0,0 +1,475 @@ +// Copyright lowRISC contributors. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// A port provides read access to the HyperRAM, and optionally write access too. +// It retains up to a full burst of read data and must maintain coherency with any write traffic +// in the event that writes are supported. +// +// An instruction port need not support write operations and does not require tag bits. +module hbmc_tl_port import tlul_pkg::*; #( + parameter int unsigned HyperRAMAddrW = 20, + // log2(burst length in bytes) + parameter int unsigned Log2BurstLen = 5, // 32-byte bursts. + parameter int unsigned NumBufs = 4, + parameter int unsigned PortIDWidth = 1, + parameter int unsigned Log2MaxBufs = 2, + parameter int unsigned SeqWidth = 6, + // + // Does this port need to support TileLink write operations? + parameter bit SupportWrites = 1, + // Coalesce write transfers into burst writes to the HBMC? + parameter bit CoalesceWrites = 1, + + // Derived address bit parameters. + localparam int unsigned ABIT = $clog2(top_pkg::TL_DW / 8), + localparam int unsigned BBIT = Log2BurstLen +) ( + input clk_i, + input rst_ni, + + // Constant indicating port number. + input [PortIDWidth-1:0] portid_i, + + // TL-UL interface. + input tl_h2d_t tl_i, + output tl_d2h_t tl_o, + + // Write notification input. + input wr_notify_i, + input [HyperRAMAddrW-1:ABIT] wr_notify_addr_i, + input [top_pkg::TL_DBW-1:0] wr_notify_mask_i, + input [top_pkg::TL_DW-1:0] wr_notify_data_i, + + // Write notification output. 
+ output logic wr_notify_o, + output logic [top_pkg::TL_DBW-1:0] wr_notify_mask_o, + output logic [top_pkg::TL_DW-1:0] wr_notify_data_o, + output logic [HyperRAMAddrW-1:ABIT] wr_notify_addr_o, + + // Command data to the HyperRAM controller; command, address and burst length + output logic cmd_req_o, + input cmd_wready_i, + output logic [HyperRAMAddrW-1:ABIT] cmd_mem_addr_o, + output logic [Log2BurstLen-ABIT:0] cmd_word_cnt_o, + output logic cmd_wr_not_rd_o, + output logic cmd_wrap_not_incr_o, + output logic [SeqWidth-1:0] cmd_seq_o, + + output logic tag_cmd_req, + output logic [HyperRAMAddrW-1:ABIT] tag_cmd_mem_addr, + output logic tag_cmd_wr_not_rd, + output tag_cmd_wcap, + + output logic dfifo_wr_ena_o, + input dfifo_wr_full_i, + output [top_pkg::TL_DBW-1:0] dfifo_wr_strb_o, + output [top_pkg::TL_DW-1:0] dfifo_wr_din_o, + + // Read data from the HyperRAM + output ufifo_rd_ena, + input ufifo_rd_empty, + input [top_pkg::TL_DW-1:0] ufifo_rd_dout, + input [SeqWidth-1:0] ufifo_rd_seq, + input ufifo_rd_last, + + // Tag read data interface. + output tag_rdata_rready, + input tl_tag_bit +); + +/*----------------------------------------------------------------------------------------------------------------------------*/ + + logic tl_req_fifo_wready; + logic tl_req_fifo_le1; + logic wr_notify_match; + logic dfifo_wr_full; + logic cmd_wready; + logic can_accept; + logic rdbuf_hit; + logic rdbuf_re; + logic issue; + + // We can accept an incoming TileLink transaction when we've got space in the hyperram, tag + // and TileLink request FIFOs. If we're taking in a write transaction we also need space in the + // downstream FIFO(dfifo) for the write data. + // + // If a read hits in the buffer but the data is not yet available, wait until it arrives + // from the HyperRAM controller. This is indicated by the 'valid' bit becoming set for that data + // word. 
+ // + // Note: If a read hits in the RAM we wait until the TL request FIFO has at most a single entry + // because we don't have a FIFO for the read data itself. + + assign can_accept = tl_req_fifo_wready && + ((rd_req & rdbuf_valid & tl_req_fifo_le1) || (!rdbuf_hit && cmd_wready)) && + (tl_i.a_opcode == Get || ~dfifo_wr_full) & + ~(wr_notify_i & wr_notify_match); + +/*----------------------------------------------------------------------------------------------------------------------------*/ + + wire rd_req = tl_i.a_valid & (tl_i.a_opcode == Get); + wire wr_req = tl_i.a_valid & (tl_i.a_opcode == PutFullData || tl_i.a_opcode == PutPartialData); + +/*----------------------------------------------------------------------------------------------------------------------------*/ + if (SupportWrites) begin + // Issue write notifications. + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + wr_notify_o <= 1'b0; + end else begin + // Notification of a write occurring on this port. + wr_notify_o <= wr_req & can_accept; + // Address to which the write was performed. + wr_notify_addr_o <= tl_i.a_address[HyperRAMAddrW-1:ABIT]; + // Mask specifying the sub-words being written. + wr_notify_mask_o <= tl_i.a_mask; + // Data being written. + wr_notify_data_o <= tl_i.a_data; + end + end + end else begin + // Do not issue write notifications from this port. + assign wr_notify_o = 1'b0; + assign wr_notify_addr_o = '0; + assign wr_notify_mask_o = '0; + assign wr_notify_data_o = '0; + end + +/*----------------------------------------------------------------------------------------------------------------------------*/ + + logic rdbuf_matches; // Address matches within the read buffer. + logic rdbuf_valid; // Valid data is available within the read buffer. + logic [SeqWidth-1:0] rdbuf_seq; // Sequence number of read buffer contents. + logic [top_pkg::TL_DW-1:0] rdbuf_dout; + + // Invalidate the read buffer contents when a write occurs. 
+ // + // Write notifications have the highest priority and must immediately update or invalidate + // the contents of the read buffer in the event of a collision. `wr_notify_i` is asserted for a + // single cycle. + // + // The read buffer informs us when a write notification hits in the buffer, and any + // simultaneous system bus transaction must then be delayed because the buffer is busy. + wire rdbuf_invalidate = &{SupportWrites, wr_req, rdbuf_matches, ~rdbuf_valid}; + wire rdbuf_update = &{SupportWrites, wr_req, rdbuf_valid}; + + // Issue a new burst read if a read is performed outside of the current buffered address range. + wire rdbuf_set = rd_req & ~rdbuf_matches & issue; + assign rdbuf_hit = rd_req & rdbuf_matches; + // Read data available and can issue the read in this cycle. The read buffer will return the + // data in the following cycle. + assign rdbuf_re = &{rd_req, rdbuf_valid, issue}; + + // Read buffer retains up to `NumBufs` burst(s) of data read from the HyperRAM for this port; + // the data arrives incrementally and may be returned as soon as it becomes available. + // + // Hit tests are performed in parallel on both the address for the current TL-UL transaction + // and any write notification. The `matches` outputs indicate an address hit on one of the + // internal burst buffers, and the `valid` output indicates that there is valid data + // available for the specified word being addressed. + hyperram_rdbuf #( + .AW (HyperRAMAddrW), + .DW (top_pkg::TL_DW), + .DBW (top_pkg::TL_DBW), + .NumBufs (NumBufs), + .PortIDWidth (PortIDWidth), + .Log2MaxBufs (Log2MaxBufs), + .SeqWidth (SeqWidth), + .BBIT (BBIT) + ) u_readbuf( + .clk_i (clk_i), + .rst_ni (rst_ni), + + // Constant indicating the port number. + .portid_i (portid_i), + + // Read/update hit test. + .addr_i (tl_i.a_address[HyperRAMAddrW-1:ABIT]), + .mask_i (tl_i.a_mask), + .data_i (tl_i.a_data), + .matches_o (rdbuf_matches), + .valid_o (rdbuf_valid), + + // Write notification test. 
+ .wr_notify_i (wr_notify_i), + .wr_notify_addr_i (wr_notify_addr_i[HyperRAMAddrW-1:ABIT]), + .wr_notify_mask_i (wr_notify_mask_i), + .wr_notify_data_i (wr_notify_data_i), + .wr_matches_o (wr_notify_match), + + // Control of buffer content. + .invalidate_i (rdbuf_invalidate), + .update_i (rdbuf_update), + .set_i (rdbuf_set), + .seq_o (rdbuf_seq), + + // Reading from buffer. + .read_i (rdbuf_re), + .rdata_o (rdbuf_dout), + + // Writing into buffer. + .write_i (ufifo_rd_ena), + .wseq_i (ufifo_rd_seq), + .wdata_i (ufifo_rd_dout) + ); + +/*----------------------------------------------------------------------------------------------------------------------------*/ + + localparam int unsigned TL_REQ_FIFO_DEPTH = 4; + localparam int unsigned TLReqFifoDepthW = prim_util_pkg::vbits(TL_REQ_FIFO_DEPTH+1); + + // Metadata from inbound TileLink transactions that needs to be saved to produce the response + typedef struct packed { + logic [top_pkg::TL_AIW-1:0] tl_source; + logic [top_pkg::TL_SZW-1:0] tl_size; + logic cmd_fetch; + logic cmd_wr_not_rd; + } tl_req_info_t; + + tl_req_info_t tl_req_fifo_wdata, tl_req_fifo_rdata; + + logic tl_req_fifo_wvalid; + logic tl_req_fifo_rvalid, tl_req_fifo_rready; + logic [TLReqFifoDepthW-1:0] tl_req_fifo_depth; + + // Reads from the buffer are issued into the TL request FIFO only when it has at most a single + // entry, because otherwise we would need additional storage for the read data. + assign tl_req_fifo_le1 = ~|tl_req_fifo_depth[TLReqFifoDepthW-1:1]; + + tl_d2h_t tl_o_int; + + // To be a contender in the arbitration among all ports, we need to express our intention + // to write into the command buffer. 
+ wire cmd_req = tl_i.a_valid && tl_req_fifo_wready && !rdbuf_hit && + (tl_i.a_opcode == Get || ~dfifo_wr_full); + + assign issue = tl_i.a_valid & can_accept; + + // Logic for handling incoming TileLink requests + logic cmd_wr_not_rd; + logic dfifo_wr_ena; + always_comb begin + cmd_wr_not_rd = (tl_i.a_opcode != Get); + tag_cmd_req = 1'b0; + dfifo_wr_ena = 1'b0; + tl_req_fifo_wvalid = 1'b0; + + if (issue) begin + // Write to the relevant FIFOs and indicate ready on TileLink A channel + tag_cmd_req = 1'b1; + tl_req_fifo_wvalid = 1'b1; + + if (tl_i.a_opcode != Get) begin + dfifo_wr_ena = 1'b1; + end + end + end + + assign tag_cmd_wr_not_rd = cmd_wr_not_rd; + assign tag_cmd_mem_addr = tl_i.a_address[HyperRAMAddrW-1:ABIT]; + + wire tl_cmd_fetch = ~(rd_req & rdbuf_valid); + wire tl_cmd_wr_not_rd = (tl_i.a_opcode != Get); + + assign tl_req_fifo_wdata = '{ + tl_source : tl_i.a_source, + tl_size : tl_i.a_size, + cmd_fetch : tl_cmd_fetch, + cmd_wr_not_rd : tl_cmd_wr_not_rd + }; + + // We decant the read data from the 'Upstream FIFO' into the read buffer as soon as possible, + // both to prevent the FIFO from overflowing and to avoid holding up other read ports. + // + // Note that we must extract all of the data that we requested, even if it's no longer relevant + // and shall ultimately be discarded, i.e. check only the port ID number here. + assign ufifo_rd_ena = (ufifo_rd_seq[SeqWidth-1:SeqWidth-PortIDWidth] == portid_i) & + ~ufifo_rd_empty; + + // Track the reading of bursts from the HyperRAM controller. + logic ufifo_rd_bursting; + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + ufifo_rd_bursting <= 1'b0; + end else if (ufifo_rd_ena & (ufifo_rd_last | ~ufifo_rd_bursting)) + ufifo_rd_bursting <= !ufifo_rd_last; + end + + // First word of data returned as part of a burst read operation. 
+ logic [top_pkg::TL_DW-1:0] ufifo_dout_first; + assign ufifo_dout_first = ufifo_rd_dout[top_pkg::TL_DW-1:0]; + + // If the data from the read buffer is not accepted immediately by the host we must register it + // to prevent it being invalidated by another read. + logic rdata_valid_q; + logic [top_pkg::TL_DW-1:0] rdata_q; + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + rdata_valid_q <= 1'b0; + end else if (tl_o_int.d_valid) begin + if (tl_i.d_ready) rdata_valid_q <= 1'b0; // Response sent. + else begin + // Capture read data and keep it stable until it is accepted by the host. + rdata_valid_q <= !tl_req_fifo_rdata.cmd_wr_not_rd; + if (!rdata_valid_q) begin + rdata_q <= tl_req_fifo_rdata.cmd_fetch ? ufifo_dout_first : rdbuf_dout; + end + end + end + end + + // Logic for sending out TileLink responses. + // - write responses may be sent as soon as the host is ready to accept them. + // - read responses may be sent as soon as the first word of data is available; we employ + // wrapping bursts to ensure that the first word of the burst is the one being requested + // by the TileLink host. + always_comb begin + tl_o_int = '0; + if (tl_req_fifo_rvalid) begin + // We have an incoming request that needs a response + if (tl_req_fifo_rdata.cmd_wr_not_rd) begin + // If it's a write then return an immediate response (early response is reasonable as any + // read that could observe the memory cannot occur until the write has actually happened) + tl_o_int.d_valid = 1'b1; + end else begin + // Otherwise wait until we have the first word of data to return. + tl_o_int.d_valid = |{ufifo_rd_ena & ~ufifo_rd_bursting, // Initial word of burst read. + ~tl_req_fifo_rdata.cmd_fetch, // From read buffer. + rdata_valid_q}; // Holding read data stable until accepted. + end + end + + tl_o_int.d_opcode = tl_req_fifo_rdata.cmd_wr_not_rd ? 
AccessAck : AccessAckData; + tl_o_int.d_size = tl_req_fifo_rdata.tl_size; + tl_o_int.d_source = tl_req_fifo_rdata.tl_source; + tl_o_int.d_data = rdata_valid_q ? rdata_q : + (tl_req_fifo_rdata.cmd_fetch ? ufifo_dout_first : rdbuf_dout); + tl_o_int.d_user.capability = tl_tag_bit; + tl_o_int.a_ready = issue; + end + + // Complete the TL request as soon as the response is accepted; this avoids the need to register + // the properties of the response. + assign tl_req_fifo_rready = tl_o_int.d_valid & tl_i.d_ready; + + // Discard the tag read data once the _read_ data is accepted. + assign tag_rdata_rready = tl_o_int.d_valid & tl_i.d_ready & ~tl_req_fifo_rdata.cmd_wr_not_rd; + + // Generate integrity for outgoing response. + tlul_rsp_intg_gen #( + .EnableRspIntgGen(0), + .EnableDataIntgGen(0) + ) u_tlul_rsp_intg_gen ( + .tl_i (tl_o_int), + .tl_o (tl_o) + ); + + // Queue of pending TileLink requests. + prim_fifo_sync #( + .Width($bits(tl_req_info_t)), + .Depth(TL_REQ_FIFO_DEPTH), + .Pass(1'b0) + ) u_tl_req_fifo ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .clr_i (1'b0), + .wvalid_i (tl_req_fifo_wvalid), + .wready_o (tl_req_fifo_wready), + .wdata_i (tl_req_fifo_wdata), + .rvalid_o (tl_req_fifo_rvalid), + .rready_i (tl_req_fifo_rready), + .rdata_o (tl_req_fifo_rdata), + + .full_o (), + .depth_o (tl_req_fifo_depth), + .err_o () + ); + + // Command requests to the HyperRAM controller. + // + // If this port performs write coalescing, these commands may be modified or suppressed by the + // `hyperram_wrbuf` instance below. + wire [HyperRAMAddrW-1:ABIT] cmd_mem_addr = tl_i.a_address[HyperRAMAddrW-1:ABIT]; + wire [Log2BurstLen-ABIT:0] cmd_rd_len = {1'b1, {(Log2BurstLen-ABIT){1'b0}}}; // Full burst. + wire [Log2BurstLen-ABIT:0] cmd_wr_len = {{(Log2BurstLen-ABIT){1'b0}}, 1'b1}; // Single word. + wire [Log2BurstLen-ABIT:0] cmd_word_cnt = (tl_i.a_opcode == Get) ? cmd_rd_len : cmd_wr_len; + // Write bursts are linear, reads wrap. 
Linear bursts are more beneficial to write coalescing, + // but read bursts are wrapping, so that the requested data may be returned as soon as possible. + wire cmd_wrap_not_incr = (tl_i.a_opcode == Get); + wire [SeqWidth-1:0] cmd_seq = rdbuf_seq; + + assign tag_cmd_wcap = tl_i.a_user.capability; + + // Write buffer performs basic write coalescing to produce larger write bursts. + if (SupportWrites && CoalesceWrites) begin : gen_write_buffer + // This logic sits between the TL-UL handling and the Command and Downstream + // Data FIFOs, modifying the traffic. + // It must also be aware of HyperRAM reads in order to flush out any buffered + // write data first in the event of a collision. + hyperram_wrbuf #( + .AW (HyperRAMAddrW), + .DW (top_pkg::TL_DW), + .DBW (top_pkg::TL_DBW), + .Log2BurstLen (Log2BurstLen), + .SeqWidth (SeqWidth) + ) u_writebuf( + .clk_i (clk_i), + .rst_ni (rst_ni), + + // Data to be written into the Downstream FIFO. + .dfifo_wr_full_o (dfifo_wr_full), + .dfifo_wr_strb_i (tl_i.a_mask), + .dfifo_wr_din_i (tl_i.a_data), + + // Input command requests for any TL-UL operation that could not be fully + // satisfied by the read buffer. + .cmd_req_i (cmd_req), + .cmd_wready_o (cmd_wready), + .cmd_mem_addr_i (cmd_mem_addr), + .cmd_word_cnt_i (cmd_word_cnt), + .cmd_wr_not_rd_i (cmd_wr_not_rd), + .cmd_seq_i (cmd_seq), + + // Modified write traffic to the Downstream FIFO. + .dfifo_wr_ena_o (dfifo_wr_ena_o), + .dfifo_wr_full_i (dfifo_wr_full_i), + .dfifo_wr_strb_o (dfifo_wr_strb_o), + .dfifo_wr_din_o (dfifo_wr_din_o), + + // Modified command requests to the HyperRAM controller. + .cmd_req_o (cmd_req_o), + .cmd_wready_i (cmd_wready_i), + .cmd_mem_addr_o (cmd_mem_addr_o), + .cmd_word_cnt_o (cmd_word_cnt_o), + .cmd_wr_not_rd_o (cmd_wr_not_rd_o), + .cmd_wrap_not_incr_o (cmd_wrap_not_incr_o), + .cmd_seq_o (cmd_seq_o) + ); + + // Write buffer logic controls the Downstream FIFO. 
+ logic unused_wrbuf; + assign unused_wrbuf = ^{dfifo_wr_ena, cmd_wrap_not_incr}; + end else begin : gen_no_write_buffer + // Commands to the HyperRAM controller propagate unmodified. + assign cmd_req_o = cmd_req; + assign cmd_wready = cmd_wready_i; + assign cmd_mem_addr_o = cmd_mem_addr; + assign cmd_word_cnt_o = cmd_word_cnt; + assign cmd_wr_not_rd_o = cmd_wr_not_rd; + assign cmd_wrap_not_incr_o = cmd_wrap_not_incr; + assign cmd_seq_o = cmd_seq; + + // Data to be written into the Downstream FIFO. + assign dfifo_wr_ena_o = dfifo_wr_ena; + assign dfifo_wr_full = dfifo_wr_full_i; + assign dfifo_wr_strb_o = tl_i.a_mask; + assign dfifo_wr_din_o = tl_i.a_data; + end + + // Unused signals. + logic unused; + assign unused = ^{tl_i.a_param, tl_req_fifo_depth[0]}; + +endmodule + diff --git a/rtl/ip/hyperram/rtl/hbmc_tl_top.sv b/rtl/ip/hyperram/rtl/hbmc_tl_top.sv index 14333ada4..cc03631a1 100644 --- a/rtl/ip/hyperram/rtl/hbmc_tl_top.sv +++ b/rtl/ip/hyperram/rtl/hbmc_tl_top.sv @@ -59,6 +59,7 @@ module hbmc_tl_top import tlul_pkg::*; #( parameter [4:0] C_DQ1_IDELAY_TAPS_VALUE = 0, parameter [4:0] C_DQ0_IDELAY_TAPS_VALUE = 0, + parameter int unsigned NumPorts = 2, parameter integer HyperRAMSize = 1024 * 1024 // 1 MiB ) ( @@ -70,8 +71,8 @@ module hbmc_tl_top import tlul_pkg::*; #( input clk_iserdes, input clk_idelay_ref, - input tl_h2d_t tl_i, - output tl_d2h_t tl_o, + input tl_h2d_t tl_i[NumPorts], + output tl_d2h_t tl_o[NumPorts], /* HyperBus Interface Port */ output wire hb_ck_p, @@ -82,28 +83,66 @@ module hbmc_tl_top import tlul_pkg::*; #( inout wire [7:0] hb_dq ); - localparam integer HyperRAMAddrW = $clog2(HyperRAMSize); + // Maximum number of data buffers per port. + localparam int unsigned MaxBufs = 4; + + // Two TL-UL access ports. + localparam int unsigned PortD = 0; + localparam int unsigned PortI = 1; + + // Width of port ID numbers, in bits. 
+ localparam int unsigned PortIDWidth = $clog2(NumPorts); + localparam int unsigned Log2MaxBufs = $clog2(MaxBufs); + // Up to 4 outstanding requests from a single buffer +1 for invalidation, plus + // bits to identify the buffer number, and a further bit for the port number of + // the requester. + localparam int unsigned SeqWidth = PortIDWidth + Log2MaxBufs + 3; + + localparam int unsigned HyperRAMAddrW = $clog2(HyperRAMSize); + // LSB of word address. + localparam int unsigned ABIT = $clog2(top_pkg::TL_DW / 8); + // Use 32-byte bursts for performance, whilst reducing the penalty of wasted burst reads. + localparam int unsigned Log2BurstLen = 5; /*----------------------------------------------------------------------------------------------------------------------------*/ + /* We need the Upstream FIFO from the HyperRAM controller core to accommodate an entire + burst read. Upstream transfers write 16 bits into the upstream FIFO every cycle, but the + Sonata system clock is only 40% of that clock frequency. Additionally, because of the CDC + into the slower clock domain, it can take 4 system clock cycles to collect the first word. + */ + localparam int unsigned UDataWidth = top_pkg::TL_DW; + localparam int unsigned UFIFODepth = 1 << (Log2BurstLen - ABIT); + + /* The Downstream FIFO to the HyperRAM controller must be wide enough and deep enough to + * accommodate all of the write data for a burst. The write coalescing logic in `hyperram_wrbuf` + * relies upon being able to push data words before issuing the write command, and once the write + * command is accepted, data will be popped faster than the system clock can supply it. 
+ */ + localparam int unsigned DDataWidth = top_pkg::TL_DW; + localparam int unsigned DFIFODepth = 1 << (Log2BurstLen - ABIT); + logic idelayctrl_rdy_sync; logic clk_idelay; + /* Tag memory interface */ + logic tag_cmd_req; + logic tag_rdata_rready; /* HBMC command interface */ - logic tag_cmd_req, tag_cmd_wready; - logic cmd_wvalid, cmd_wready; - logic [31:0] cmd_mem_addr; - logic [15:0] cmd_word_cnt; - logic cmd_wr_not_rd; - logic cmd_wrap_not_incr; - logic cmd_ack; + logic [NumPorts-1:0] cmd_req; + logic [NumPorts-1:0] cmd_wready; + logic [NumPorts-1:0][HyperRAMAddrW-1:ABIT] cmd_mem_addr; + logic [NumPorts-1:0][Log2BurstLen-ABIT:0] cmd_word_cnt; // Bus words. + logic [NumPorts-1:0] cmd_wr_not_rd; + logic [NumPorts-1:0] cmd_wrap_not_incr; + logic [NumPorts-1:0][SeqWidth-1:0] cmd_seq; /* Upstream FIFO wires */ logic [15:0] ufifo_wr_data; logic ufifo_wr_last; logic ufifo_wr_ena; - logic [top_pkg::TL_DW-1:0] ufifo_rd_dout; + logic [UDataWidth-1:0] ufifo_rd_dout; logic [9:0] ufifo_rd_free; logic ufifo_rd_last; logic ufifo_rd_ena; @@ -111,13 +150,13 @@ module hbmc_tl_top import tlul_pkg::*; #( /* Downstream FIFO wires */ - logic [15:0] dfifo_rd_data; - logic [1:0] dfifo_rd_strb; - logic dfifo_rd_ena; - logic [top_pkg::TL_DW-1:0] dfifo_wr_din; - logic [top_pkg::TL_DW/8-1:0] dfifo_wr_strb; - logic dfifo_wr_ena; - logic dfifo_wr_full; + logic [15:0] dfifo_rd_data; + logic [1:0] dfifo_rd_strb; + logic dfifo_rd_ena; + logic [DDataWidth-1:0] dfifo_wr_din; + logic [DDataWidth/8-1:0] dfifo_wr_strb; + logic dfifo_wr_ena; + logic dfifo_wr_full; /*----------------------------------------------------------------------------------------------------------------------------*/ @@ -178,151 +217,215 @@ module hbmc_tl_top import tlul_pkg::*; #( endgenerate /*----------------------------------------------------------------------------------------------------------------------------*/ + // FIFOs on the TL-UL ports break a combinatorial loop between the instruction fetch and LSU ports + // 
of the CPU since the HyperRAM is presented to both ports, but without adding latency to the + // request or response. + tlul_pkg::tl_h2d_t tl_i_int[NumPorts]; + tlul_pkg::tl_d2h_t tl_o_int[NumPorts]; + + for (genvar p = 0; p < NumPorts; p++) begin : gen_tlul_fifos + tlul_fifo_sync #( + .ReqPass (1'b1), // Do not add latency. + .RspPass (1'b1), + .ReqDepth (1), // No need for more than a single slot. + .RspDepth (1) + ) u_tl_fifo( + .clk_i (clk_i), + .rst_ni (rst_ni), + .tl_h_i (tl_i[p]), + .tl_h_o (tl_o[p]), + .tl_d_o (tl_i_int[p]), + .tl_d_i (tl_o_int[p]), + .spare_req_i ('0), + .spare_req_o (), + .spare_rsp_i ('0), + .spare_rsp_o () + ); + end : gen_tlul_fifos - // Metadata from inbound tilelink transactions that needs to be saved to produce the response - typedef struct packed { - logic [top_pkg::TL_AIW-1:0] tl_source; - logic [top_pkg::TL_SZW-1:0] tl_size; - logic cmd_wr_not_rd; - } tl_req_info_t; - - tl_req_info_t tl_req_fifo_wdata, tl_req_fifo_rdata; - - logic tl_req_fifo_wvalid, tl_req_fifo_wready; - logic tl_req_fifo_rvalid, tl_req_fifo_rready; - - logic tl_tag_bit, tl_a_ready; - - tl_d2h_t tl_o_int; - - // Logic for handling incoming tilelink requests - always_comb begin - cmd_wvalid = 1'b0; - cmd_wr_not_rd = 1'b0; - tag_cmd_req = 1'b0; - dfifo_wr_ena = 1'b0; - tl_req_fifo_wvalid = 1'b0; - tl_a_ready = 1'b0; - - if (tl_i.a_valid && tl_req_fifo_wready && tag_cmd_wready && cmd_wready && - (tl_i.a_opcode == Get || ~dfifo_wr_full)) begin - // We can accept an incoming tilelink transaction when we've got space in the hyperram, tag - // and tilelink request FIFOs. 
If we're taking in a write transaction we also need space in the - // downstream FIFO(dfifo) for the write data - - // Write to the relevant FIFOs and indicate ready on tilelink A channel - cmd_wvalid = 1'b1; - tag_cmd_req = 1'b1; - tl_req_fifo_wvalid = 1'b1; - tl_a_ready = 1'b1; - - if (tl_i.a_opcode != Get) begin - cmd_wr_not_rd = 1'b1; - dfifo_wr_ena = 1'b1; - end - end +/*----------------------------------------------------------------------------------------------------------------------------*/ + // TL-UL Access ports. + + logic [SeqWidth-1:0] ufifo_rd_seq; + + logic [NumPorts-1:0] ufifo_all_rd_ena; + + logic [NumPorts-1:0][DDataWidth-1:0] dfifo_all_wr_din; + logic [NumPorts-1:0][DDataWidth/8-1:0] dfifo_all_wr_strb; + logic [NumPorts-1:0] dfifo_all_wr_ena; + logic [NumPorts-1:0] dfifo_all_wr_full; + + logic [NumPorts-1:0] tag_all_cmd_req; + logic [NumPorts-1:0][HyperRAMAddrW-1:ABIT] tag_all_cmd_mem_addr; + logic [NumPorts-1:0] tag_all_cmd_wr_not_rd; + logic [NumPorts-1:0] tag_all_rdata_rready; + + logic tl_tag_bit; + logic [NumPorts-1:0] tag_all_cmd_wcap; + + // Write notifications. + logic [NumPorts-1:0] wr_notify_out; + logic [NumPorts-1:0] wr_notify_in; + logic [NumPorts-1:0][HyperRAMAddrW-1:ABIT] wr_notify_addr_out; + logic [NumPorts-1:0][HyperRAMAddrW-1:ABIT] wr_notify_addr_in; + logic [NumPorts-1:0][top_pkg::TL_DBW-1:0] wr_notify_mask_out; + logic [NumPorts-1:0][top_pkg::TL_DBW-1:0] wr_notify_mask_in; + logic [NumPorts-1:0][top_pkg::TL_DW-1:0] wr_notify_data_out; + logic [NumPorts-1:0][top_pkg::TL_DW-1:0] wr_notify_data_in; + + if (NumPorts > 1) begin : gen_wr_notify + // Instruction port requires notifications of writes occurring on the Data port. + assign wr_notify_in[PortI] = wr_notify_out[PortD]; + assign wr_notify_addr_in[PortI] = wr_notify_addr_out[PortD]; + assign wr_notify_mask_in[PortI] = wr_notify_mask_out[PortD]; + assign wr_notify_data_in[PortI] = wr_notify_data_out[PortD]; + // Writes shall not occur on the Instruction port. 
+ assign {wr_notify_in[PortD], wr_notify_addr_in[PortD]} = '0; + // Tie off the mask and data too, so that no notification input of the Data port is undriven. + assign {wr_notify_mask_in[PortD], wr_notify_data_in[PortD]} = '0; + end else begin : gen_no_wr_notify + assign wr_notify_in = '0; + assign wr_notify_addr_in = '0; + assign wr_notify_mask_in = '0; + assign wr_notify_data_in = '0; end - assign dfifo_wr_strb = tl_i.a_mask; - assign dfifo_wr_din = tl_i.a_data; - - assign tl_req_fifo_wdata = '{ - tl_source : tl_i.a_source, - tl_size : tl_i.a_size, - cmd_wr_not_rd : cmd_wr_not_rd - }; - - // Logic for sending out tilelink responses - always_comb begin - tl_o_int = '0; - tl_req_fifo_rready = 1'b0; - ufifo_rd_ena = 1'b0; - - if (tl_req_fifo_rvalid) begin - // We have an incoming request that needs a response - if (tl_req_fifo_rdata.cmd_wr_not_rd) begin - // If it's a write then return an immediate response (early response is reasonable as any - // read that could observe the memory cannot occur until the write has actually happened) - tl_o_int.d_valid = 1'b1; - tl_req_fifo_rready = tl_i.d_ready; - end else begin - // Otherwise wait until we have read data to return - tl_o_int.d_valid = ~ufifo_rd_empty; - // Only dequeue read data from the upstream FIFO (ufifo) and request FIFO when the tilelink - // D channel is ready - ufifo_rd_ena = tl_i.d_ready & ~ufifo_rd_empty; - tl_req_fifo_rready = ufifo_rd_ena; + for (genvar p = 0; p < NumPorts; p++) begin : gen_ports + hbmc_tl_port #( + .HyperRAMAddrW (HyperRAMAddrW), + .Log2BurstLen (Log2BurstLen), + .NumBufs (MaxBufs), + .PortIDWidth (PortIDWidth), + .Log2MaxBufs (Log2MaxBufs), + .SeqWidth (SeqWidth), + .SupportWrites (p == PortD) // Only the data port supports writing. + ) u_port( + .clk_i (clk_i), + .rst_ni (rst_ni), + + // Port numbers. + .portid_i (PortIDWidth'(p)), + + // TL-UL interface. + .tl_i (tl_i_int[p]), + .tl_o (tl_o_int[p]), + + // Write notification input. + .wr_notify_i (wr_notify_in[p]), + .wr_notify_addr_i (wr_notify_addr_in[p]), + .wr_notify_mask_i (wr_notify_mask_in[p]), + .wr_notify_data_i (wr_notify_data_in[p]), + + // Write notification output.
+ .wr_notify_o (wr_notify_out[p]), + .wr_notify_addr_o (wr_notify_addr_out[p]), + .wr_notify_mask_o (wr_notify_mask_out[p]), + .wr_notify_data_o (wr_notify_data_out[p]), + + // Command data to the HyperRAM controller. + .cmd_req_o (cmd_req[p]), + .cmd_wready_i (cmd_wready[p]), + .cmd_mem_addr_o (cmd_mem_addr[p]), + .cmd_word_cnt_o (cmd_word_cnt[p]), + .cmd_wr_not_rd_o (cmd_wr_not_rd[p]), + .cmd_wrap_not_incr_o(cmd_wrap_not_incr[p]), + .cmd_seq_o (cmd_seq[p]), + .tag_cmd_req (tag_all_cmd_req[p]), + .tag_cmd_mem_addr (tag_all_cmd_mem_addr[p]), + .tag_cmd_wr_not_rd (tag_all_cmd_wr_not_rd[p]), + .tag_cmd_wcap (tag_all_cmd_wcap[p]), + .dfifo_wr_ena_o (dfifo_all_wr_ena[p]), + .dfifo_wr_full_i (dfifo_all_wr_full[p]), + .dfifo_wr_strb_o (dfifo_all_wr_strb[p]), + .dfifo_wr_din_o (dfifo_all_wr_din[p]), + + // Read data from the HyperRAM controller. + .ufifo_rd_ena (ufifo_all_rd_ena[p]), + .ufifo_rd_empty (ufifo_rd_empty), + .ufifo_rd_dout (ufifo_rd_dout), + .ufifo_rd_seq (ufifo_rd_seq), + .ufifo_rd_last (ufifo_rd_last), + + // Tag read data interface. + .tag_rdata_rready (tag_all_rdata_rready[p]), + .tl_tag_bit (tl_tag_bit) + ); + end : gen_ports + + // Upstream FIFO traffic is presented to all ports, but only the one indicated by the sequence + // number within the FIFO entry shall consume it. + assign ufifo_rd_ena = |ufifo_all_rd_ena; + + // Downstream FIFO traffic comes from the Data port, since this is the only port that performs + // writes. + assign dfifo_wr_ena = dfifo_all_wr_ena[PortD]; + assign dfifo_wr_strb = dfifo_all_wr_strb[PortD]; + assign dfifo_wr_din = dfifo_all_wr_din[PortD]; + assign dfifo_all_wr_full = {NumPorts{dfifo_wr_full}}; + + // Only the Data port requires tag bits. + assign tag_cmd_req = tag_all_cmd_req[PortD]; + assign tag_rdata_rready = tag_all_rdata_rready[PortD]; + +/*----------------------------------------------------------------------------------------------------------------------------*/ + // Arbitrate amongst the access ports. 
+ localparam BUS_SYNC_WIDTH = (HyperRAMAddrW - ABIT) // Address, in terms of TL-UL bus words. + + (Log2BurstLen + 1 - ABIT) // Number of TL-UL bus words. + + 1 + 1 + SeqWidth; // Write/Read, Wrap/linear, Sequence no. + logic cmd_fifo_wvalid; + logic cmd_fifo_wready; + logic [BUS_SYNC_WIDTH-1:0] cmd_fifo_wdata; + + if (NumPorts > 1) begin : gen_multi_port + logic [BUS_SYNC_WIDTH-1:0] cmd_wdata[NumPorts]; + always_comb begin + for (int unsigned p = 0; p < NumPorts; p++) begin + cmd_wdata[p] = {cmd_mem_addr[p], cmd_word_cnt[p], cmd_wr_not_rd[p], cmd_wrap_not_incr[p], cmd_seq[p]}; end end - tl_o_int.d_opcode = tl_req_fifo_rdata.cmd_wr_not_rd ? AccessAck : AccessAckData; - tl_o_int.d_size = tl_req_fifo_rdata.tl_size; - tl_o_int.d_source = tl_req_fifo_rdata.tl_source; - tl_o_int.d_data = ufifo_rd_dout; - tl_o_int.d_user.capability = tl_tag_bit; - tl_o_int.a_ready = tl_a_ready; + prim_arbiter_fixed #( + .N (NumPorts), + .DW (BUS_SYNC_WIDTH) + ) u_cmd_arbiter( + .clk_i (clk_i), + .rst_ni (rst_ni), + + .req_i (cmd_req), + .data_i (cmd_wdata), + .gnt_o (cmd_wready), + .idx_o (), // Not used. 
+ + .valid_o (cmd_fifo_wvalid), + .data_o (cmd_fifo_wdata), + .ready_i (cmd_fifo_wready) + ); + end else begin : gen_single_port + assign cmd_fifo_wvalid = cmd_req & cmd_fifo_wready; + assign cmd_wready = cmd_fifo_wready; + assign cmd_fifo_wdata = {cmd_mem_addr, cmd_word_cnt, cmd_wr_not_rd, cmd_wrap_not_incr, cmd_seq}; end - // Generate integrity for outgoing response - tlul_rsp_intg_gen #( - .EnableRspIntgGen(0), - .EnableDataIntgGen(0) - ) u_tlul_rsp_intg_gen ( - .tl_i(tl_o_int), - .tl_o(tl_o) - ); - - localparam TL_REQ_FIFO_DEPTH = 4; - - prim_fifo_sync #( - .Width($bits(tl_req_info_t)), - .Depth(TL_REQ_FIFO_DEPTH), - .Pass(1'b0) - ) u_tl_req_fifo ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .clr_i (1'b0), - .wvalid_i (tl_req_fifo_wvalid), - .wready_o (tl_req_fifo_wready), - .wdata_i (tl_req_fifo_wdata), - .rvalid_o (tl_req_fifo_rvalid), - .rready_i (tl_req_fifo_rready), - .rdata_o (tl_req_fifo_rdata), - - .full_o (), - .depth_o (), - .err_o () - ); - - assign cmd_mem_addr = {{(33 - HyperRAMAddrW){1'b0}}, tl_i.a_address[HyperRAMAddrW-1:1]}; - assign cmd_word_cnt = 16'd2; - assign cmd_wrap_not_incr = 1'b0; - /*----------------------------------------------------------------------------------------------------------------------------*/ - localparam BUS_SYNC_WIDTH = 32 + 16 + 1 + 1; - - logic cmd_rvalid, cmd_rready; - logic [31:0] cmd_mem_addr_dst; - logic [15:0] cmd_word_cnt_dst; - logic cmd_wr_not_rd_dst; - logic cmd_wrap_not_incr_dst; - + logic cmd_rvalid, cmd_rready; + logic [HyperRAMAddrW-1:ABIT] cmd_mem_addr_dst; + logic [Log2BurstLen:ABIT] cmd_word_cnt_dst; + logic cmd_wr_not_rd_dst; + logic cmd_wrap_not_incr_dst; + logic [SeqWidth-1:0] cmd_seq_dst; - logic [BUS_SYNC_WIDTH - 1:0] cmd_wdata, cmd_rdata; + logic [BUS_SYNC_WIDTH-1:0] cmd_rdata; - assign cmd_wdata = {cmd_mem_addr, cmd_word_cnt, cmd_wr_not_rd, cmd_wrap_not_incr}; - assign {cmd_mem_addr_dst, cmd_word_cnt_dst, cmd_wr_not_rd_dst, cmd_wrap_not_incr_dst} = cmd_rdata; + assign {cmd_mem_addr_dst, 
cmd_word_cnt_dst, cmd_wr_not_rd_dst, cmd_wrap_not_incr_dst, cmd_seq_dst} = cmd_rdata; prim_fifo_async #( .Width(BUS_SYNC_WIDTH), - .Depth(2) + .Depth(4) ) u_hbmc_cmd_fifo ( .clk_wr_i (clk_i), .rst_wr_ni(rst_ni), - .wvalid_i (cmd_wvalid), - .wready_o (cmd_wready), - .wdata_i (cmd_wdata), + .wvalid_i (cmd_fifo_wvalid), + .wready_o (cmd_fifo_wready), + .wdata_i (cmd_fifo_wdata), .wdepth_o (), .clk_rd_i (clk_hbmc_0), @@ -335,6 +438,12 @@ module hbmc_tl_top import tlul_pkg::*; #( /*----------------------------------------------------------------------------------------------------------------------------*/ + // Widened address and word count, converting from TL-UL bus words to 16-bit HyperRAM words. + logic [31:0] cmd_mem_addr_full; + logic [15:0] cmd_word_cnt_full; + assign cmd_mem_addr_full = 32'({cmd_mem_addr_dst, {(ABIT-1){1'b0}}}); + assign cmd_word_cnt_full = 16'({cmd_word_cnt_dst, {(ABIT-1){1'b0}}}); + hbmc_ctrl # ( .C_AXI_DATA_WIDTH ( top_pkg::TL_DW ), @@ -378,8 +487,8 @@ module hbmc_tl_top import tlul_pkg::*; #( .cmd_valid ( cmd_rvalid ), .cmd_ready ( cmd_rready ), - .cmd_mem_addr ( cmd_mem_addr_dst ), - .cmd_word_count ( cmd_word_cnt_dst ), + .cmd_mem_addr ( cmd_mem_addr_full ), + .cmd_word_count ( cmd_word_cnt_full ), .cmd_wr_not_rd ( cmd_wr_not_rd_dst ), .cmd_wrap_not_incr ( cmd_wrap_not_incr_dst ), @@ -401,27 +510,36 @@ module hbmc_tl_top import tlul_pkg::*; #( /*----------------------------------------------------------------------------------------------------------------------------*/ + /* Return the sequence number from the command so that the read data may be steered appropriately */ + logic [SeqWidth-1:0] ufifo_wr_seq; + always_ff @(posedge clk_hbmc_0) begin + if (cmd_rvalid & cmd_rready) ufifo_wr_seq <= cmd_seq_dst; + end + /* Upstream data FIFO */ hbmc_ufifo # ( - .DATA_WIDTH ( top_pkg::TL_DW ) + .DataWidth ( UDataWidth ), + .FIFODepth ( UFIFODepth ), + .SeqWidth ( SeqWidth ) ) hbmc_ufifo_inst ( - .fifo_wr_clk ( clk_hbmc_0 ), - .fifo_wr_nrst ( 
rst_hbmc_ni ), - .fifo_wr_din ( ufifo_wr_data ), - .fifo_wr_last ( ufifo_wr_last ), - .fifo_wr_ena ( ufifo_wr_ena ), - .fifo_wr_full ( /*----NC----*/ ), - - .fifo_rd_clk ( clk_i ), - .fifo_rd_nrst ( rst_ni ), - .fifo_rd_dout ( ufifo_rd_dout ), - .fifo_rd_free ( /*----NC----*/ ), - .fifo_rd_last ( ufifo_rd_last ), - .fifo_rd_ena ( ufifo_rd_ena ), - .fifo_rd_empty ( ufifo_rd_empty ) + .fifo_wr_clk ( clk_hbmc_0 ), + .fifo_wr_nrst ( rst_hbmc_ni ), + .fifo_wr_din ( ufifo_wr_data ), + .fifo_wr_seq ( ufifo_wr_seq ), + .fifo_wr_last ( ufifo_wr_last ), + .fifo_wr_ena ( ufifo_wr_ena ), + .fifo_wr_full ( /*----NC----*/ ), + + .fifo_rd_clk ( clk_i ), + .fifo_rd_nrst ( rst_ni ), + .fifo_rd_dout ( ufifo_rd_dout ), + .fifo_rd_seq ( ufifo_rd_seq ), + .fifo_rd_last ( ufifo_rd_last ), + .fifo_rd_ena ( ufifo_rd_ena ), + .fifo_rd_empty ( ufifo_rd_empty ) ); /*----------------------------------------------------------------------------------------------------------------------------*/ @@ -429,7 +547,8 @@ module hbmc_tl_top import tlul_pkg::*; #( /* Downstream data FIFO */ hbmc_dfifo # ( - .DATA_WIDTH ( top_pkg::TL_DW ) + .DataWidth ( DDataWidth ), + .FIFODepth ( DFIFODepth ) ) hbmc_dfifo_inst ( @@ -465,41 +584,22 @@ module hbmc_tl_top import tlul_pkg::*; #( logic wdata; } tag_cmd_t; - tag_cmd_t tag_cmd_in, tag_cmd_out; - logic tag_cmd_valid; - logic tag_rdata; logic tag_rdata_valid_q, tag_rdata_valid_d; logic tag_rdata_fifo_rvalid; - assign tag_cmd_in.addr = cmd_mem_addr[TAG_ADDR_W + 1:2]; - assign tag_cmd_in.write = cmd_wr_not_rd; - assign tag_cmd_in.wdata = tl_i.a_user.capability; - - prim_fifo_sync #( - .Width($bits(tag_cmd_t)), - .Depth(TAG_FIFO_DEPTH), - .Pass(1'b0) - ) u_tag_cmd_fifo ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .clr_i (1'b0), - .wvalid_i (tag_cmd_req), - .wready_o (tag_cmd_wready), - .wdata_i (tag_cmd_in), - .rvalid_o (tag_cmd_valid), - .rready_i (1'b1), - .rdata_o (tag_cmd_out), - - .full_o (), - .depth_o (), - .err_o () - ); + // Only the Data port requires 
capability tags. + tag_cmd_t tag_cmd; + assign tag_cmd.addr = tag_all_cmd_mem_addr[PortD][HyperRAMAddrW-1:ABIT+1]; + assign tag_cmd.write = tag_all_cmd_wr_not_rd[PortD]; + assign tag_cmd.wdata = tag_all_cmd_wcap[PortD]; prim_fifo_sync #( .Width(1), .Depth(TAG_FIFO_DEPTH), - .Pass(1'b0) + // Pass through is required when a Read operation hits in the internal buffer, since that + // takes only a single cycle. + .Pass(1'b1) ) u_tag_rdata_fifo ( .clk_i (clk_i), .rst_ni (rst_ni), @@ -508,7 +608,7 @@ module hbmc_tl_top import tlul_pkg::*; #( .wready_o (), .wdata_i (tag_rdata), .rvalid_o (tag_rdata_fifo_rvalid), - .rready_i (ufifo_rd_ena), + .rready_i (tag_rdata_rready), .rdata_o (tl_tag_bit), .full_o (), @@ -516,11 +616,12 @@ module hbmc_tl_top import tlul_pkg::*; #( .err_o () ); - `ASSERT(always_tag_rdata_valid_when_read_data_response, ufifo_rd_ena |-> tag_rdata_fifo_rvalid) + // TODO: Probably wants some refinement/augmentation. + // `ASSERT(always_tag_rdata_valid_when_read_data_response, ufifo_rd_ena |-> tag_rdata_fifo_rvalid) - assign tag_rdata_valid_d = tag_cmd_valid & ~tag_cmd_out.write; + assign tag_rdata_valid_d = tag_cmd_req & ~tag_cmd.write; - always @(posedge clk_i, negedge rst_ni) begin + always_ff @(posedge clk_i, negedge rst_ni) begin if (~rst_ni) begin tag_rdata_valid_q <= 1'b0; end else begin @@ -533,10 +634,10 @@ module hbmc_tl_top import tlul_pkg::*; #( .Depth(2 ** TAG_ADDR_W) ) u_tag_ram ( .clk_i (clk_i), - .req_i (tag_cmd_valid), - .write_i (tag_cmd_out.write), - .addr_i (tag_cmd_out.addr), - .wdata_i (tag_cmd_out.wdata), + .req_i (tag_cmd_req), + .write_i (tag_cmd.write), + .addr_i (tag_cmd.addr), + .wdata_i (tag_cmd.wdata), .wmask_i ('1), .rdata_o (tag_rdata), .cfg_i ('0) diff --git a/rtl/ip/hyperram/rtl/hbmc_ufifo.sv b/rtl/ip/hyperram/rtl/hbmc_ufifo.sv index 8995f06f9..69438c5d0 100644 --- a/rtl/ip/hyperram/rtl/hbmc_ufifo.sv +++ b/rtl/ip/hyperram/rtl/hbmc_ufifo.sv @@ -2,31 +2,31 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for 
details. // SPDX-License-Identifier: Apache-2.0 -// Reimplementation of hbmc_ufifo using OpenTitan primitives, only works for DATA_WIDTH == 32 - -module hbmc_ufifo # -( - parameter integer DATA_WIDTH = 32 -) -( +// Reimplementation of hbmc_ufifo using OpenTitan primitives, only works for DataWidth == 32 +module hbmc_ufifo #( + parameter int unsigned DataWidth = 32, // Width of data word, bits. + parameter int unsigned FIFODepth = 8, // Depth of FIFO, entries. + parameter int unsigned SeqWidth = 4 // Width of sequence number, bits. +) ( input wire fifo_wr_clk, input wire fifo_wr_nrst, input wire [15:0] fifo_wr_din, + input wire [SeqWidth-1:0] fifo_wr_seq, input wire fifo_wr_last, input wire fifo_wr_ena, output wire fifo_wr_full, input wire fifo_rd_clk, input wire fifo_rd_nrst, - output wire [DATA_WIDTH - 1:0] fifo_rd_dout, - output wire [9:0] fifo_rd_free, + output wire [DataWidth-1:0] fifo_rd_dout, + output wire [SeqWidth-1:0] fifo_rd_seq, output wire fifo_rd_last, input wire fifo_rd_ena, output wire fifo_rd_empty ); - // FIFO contains 32-bit data word and 1 'last' bit - localparam int unsigned FIFOWidth = DATA_WIDTH + 1; + // FIFO contains 32-bit data word, 4 'sequence' bits and 1 'last' bit + localparam int unsigned FIFOWidth = DataWidth + SeqWidth + 1; logic [FIFOWidth-1:0] fifo_wdata, fifo_rdata; logic [15:0] fifo_wdata_first_half; @@ -37,7 +37,7 @@ module hbmc_ufifo # assign fifo_wr_full = ~fifo_wready; assign fifo_rd_empty = ~fifo_rvalid; - assign fifo_wdata = {fifo_wr_last, fifo_wr_din, fifo_wdata_first_half}; + assign fifo_wdata = {fifo_wr_last, fifo_wr_seq, fifo_wr_din, fifo_wdata_first_half}; assign fifo_wvalid = fifo_wdata_half_sel & fifo_wr_ena; always @(posedge fifo_wr_clk or negedge fifo_wr_nrst) begin @@ -56,7 +56,7 @@ module hbmc_ufifo # prim_fifo_async #( .Width(FIFOWidth), - .Depth(4) + .Depth(FIFODepth) ) u_fifo ( .clk_wr_i(fifo_wr_clk), .rst_wr_ni(fifo_wr_nrst), @@ -73,20 +73,13 @@ module hbmc_ufifo # .rdepth_o() ); - // fifo_rd_free output is 
unused in hyperram top-level - assign fifo_rd_free = '0; - - assign fifo_rd_dout = fifo_rdata[31:0]; - assign fifo_rd_last = fifo_rdata[32]; + assign {fifo_rd_last, fifo_rd_seq, fifo_rd_dout} = fifo_rdata; initial begin - if (DATA_WIDTH != 32) begin - $fatal("hbmc_ufifo only supports DATA_WIDTH of 32"); + if (DataWidth != 32) begin + $fatal("hbmc_ufifo only supports DataWidth of 32"); end end endmodule -/*----------------------------------------------------------------------------------------------------------------------------*/ - -`default_nettype wire diff --git a/rtl/ip/hyperram/rtl/hyperram.sv b/rtl/ip/hyperram/rtl/hyperram.sv index 82b8ed778..bccbc5620 100644 --- a/rtl/ip/hyperram/rtl/hyperram.sv +++ b/rtl/ip/hyperram/rtl/hyperram.sv @@ -7,9 +7,12 @@ // It also provides an SRAM model implementation for use in simulations that // don't want to include the full hyperram controller RTL and BFM (which in // particular require Xilinx encrypted IP models). + module hyperram import tlul_pkg::*; #( - parameter HyperRAMClkFreq = 100_000_000, - parameter HyperRAMSize = 1024 * 1024 + parameter HyperRAMClkFreq = 200_000_000, + parameter HyperRAMSize = 1024 * 1024, + // Number of access ports. + parameter int unsigned NumPorts = 2 ) ( input clk_i, input rst_ni, @@ -19,8 +22,8 @@ module hyperram import tlul_pkg::*; #( input clk_hr3x_i, input rst_hr_ni, - input tl_h2d_t tl_i, - output tl_d2h_t tl_o, + input tl_h2d_t tl_i[NumPorts], + output tl_d2h_t tl_o[NumPorts], inout wire [7:0] hyperram_dq, inout wire hyperram_rwds, @@ -29,15 +32,13 @@ module hyperram import tlul_pkg::*; #( output wire hyperram_nrst, output wire hyperram_cs ); -`ifdef USE_HYPERRAM_SIM_MODEL +`ifdef USE_HYPERRAM_SRAM_MODEL + // This is a simple SRAM implementation that may be used in simulation if modelling of the + // behaviour/timing of the HyperRAM is not required. It is also required in synthesis for + // the Sonata XL board because that has no HyperRAM. 
localparam int SRAMModelAddrWidth = $clog2(HyperRAMSize); localparam int UnusedParams = HyperRAMClkFreq + HyperRAMSize; - tl_h2d_t unused_tl_b; - assign unused_tl_b = '0; - - logic [7:0] unused_hyperram_dq; - logic unused_hyperram_rwds; logic unused_clk_hr; logic unused_clk_hr90p; logic unused_clk_hr3x; @@ -53,24 +54,41 @@ module hyperram import tlul_pkg::*; #( assign unused_clk_hr3x = clk_hr3x_i; assign unused_rst_hr = rst_hr_ni; - // TODO: Consider adding extra latency to roughly model the performance of the - // real hyperram controller - sram #( - .AddrWidth ( SRAMModelAddrWidth ), - .DataWidth ( 32 ), - .DataBitsPerMask ( 8 ) - ) u_hyperram_model ( - .clk_i, - .rst_ni, + if (NumPorts > 1) begin : gen_dual_port + // Dual-ported SRAM supports the LSU and Instruction Fetching. + sram #( + .AddrWidth ( SRAMModelAddrWidth ), + .DataWidth ( 32 ), + .DataBitsPerMask ( 8 ) + ) u_hyperram_model ( + .clk_i, + .rst_ni, - .tl_a_i (tl_i), - .tl_a_o (tl_o), + .tl_a_i (tl_i[0]), + .tl_a_o (tl_o[0]), - .tl_b_i (unused_tl_b), - .tl_b_o () - ); + .tl_b_i (tl_i[1]), + .tl_b_o (tl_o[1]) + ); + end else begin : gen_single_port + // Single, shared port. + tl_h2d_t unused_tl_b; + assign unused_tl_b = '0; + sram #( + .AddrWidth ( SRAMModelAddrWidth ), + .DataWidth ( 32 ), + .DataBitsPerMask ( 8 ) + ) u_hyperram_model ( + .clk_i, + .rst_ni, + .tl_a_i (tl_i[0]), + .tl_a_o (tl_o[0]), + .tl_b_i (unused_tl_b), + .tl_b_o () + ); + end `else hbmc_tl_top #( .C_HBMC_CLOCK_HZ(HyperRAMClkFreq), @@ -101,6 +119,7 @@ module hyperram import tlul_pkg::*; #( .C_DQ1_IDELAY_TAPS_VALUE(0), .C_DQ0_IDELAY_TAPS_VALUE(0), .C_ISERDES_CLOCKING_MODE(0), + .NumPorts(NumPorts), .HyperRAMSize(HyperRAMSize) ) u_hbmc_tl_top ( .clk_i(clk_i), diff --git a/rtl/ip/hyperram/rtl/hyperram_rdbuf.sv b/rtl/ip/hyperram/rtl/hyperram_rdbuf.sv new file mode 100644 index 000000000..cd9f62aac --- /dev/null +++ b/rtl/ip/hyperram/rtl/hyperram_rdbuf.sv @@ -0,0 +1,283 @@ +// Copyright lowRISC contributors. 
+// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Read buffer retains the contents of a read burst, on the premise that subsequent words within +// the burst will be required by the CPU in the near future. +module hyperram_rdbuf #( + // System bus side. + parameter int unsigned AW = 20, // Width of address, bits. + parameter int unsigned DW = 32, // Width of data, bits. + parameter int unsigned DBW = 4, // Number of update strobes. + parameter int unsigned NumBufs = 4, // Number of read buffers. + parameter int unsigned PortIDWidth = 1, // Width of Port ID, bits. + parameter int unsigned Log2MaxBufs = 2, + parameter int unsigned SeqWidth = 6, // Width of sequence number, bits. + // Burst size of 32 bytes + parameter int unsigned BBIT = 5, // 32 bytes/burst. + + // LSB of word address. + localparam int unsigned ABIT = $clog2(DW / 8), + // Size of read buffer in words of 'DW' bits. + localparam int unsigned BufWords = 1 << (BBIT - ABIT) +) ( + input clk_i, + input rst_ni, + + // Constant indicating the port number. + input [PortIDWidth-1:0] portid_i, + + // Hit test for read/update access. + input [AW-1:ABIT] addr_i, + input [DBW-1:0] mask_i, + input [DW-1:0] data_i, + output matches_o, + output valid_o, + + // Write notification test. + input wr_notify_i, + input [AW-1:ABIT] wr_notify_addr_i, + input [DBW-1:0] wr_notify_mask_i, + input [DW-1:0] wr_notify_data_i, + output wr_matches_o, + + // Control of buffer content. + input invalidate_i, + input update_i, + input set_i, + output logic [SeqWidth-1:0] seq_o, + + // Reading from the buffer (System bus side). + input read_i, + output logic [DW-1:0] rdata_o, + + // Writing to the buffer (HyperRAM side). + input write_i, + input [SeqWidth-1:0] wseq_i, + input [DW-1:0] wdata_i +); + +// Round robin replacement of read buffers when all are occupied. +// +// TODO: Investigate any benefit from LRU instead? 
Since there are presently just 4 buffers +// per port, it is simple and inexpensive to keep a rank number for each of the buffers. +logic [NumBufs-1:0] buf_replace; +if (NumBufs > 1) begin : gen_round_robin_replace + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) buf_replace <= 'b1; + else if (set_i) buf_replace <= {buf_replace[NumBufs-2:0], buf_replace[NumBufs-1]}; + end +end else begin : gen_single_buf_replace + // With a single buffer it is always the one to be replaced; driving zero here would leave + // `buf_set` deasserted once the buffer is configured, so it could never be re-allocated. + assign buf_replace = 1'b1; +end + +logic [NumBufs-1:0] configured; + +// Decide upon the buffer to be replaced; if at least one is available, arbitrarily pick the +// lowest-numbered available buffer. If none is available then we use round-robin replacement. +wire [NumBufs-1:0] avail_lsb = ~configured & ~(~configured - 'b1); +wire [NumBufs-1:0] buf_set = &configured ? buf_replace : avail_lsb; + +// Validity bits for buffer words. +logic [NumBufs-1:0][BufWords-1:0] valid; + +// Base address of buffer contents. +logic [NumBufs-1:0][AW-1:BBIT] base_addr; + +// Individual responses from hit tests on the buffers. +logic [NumBufs-1:0] matches_all; +logic [NumBufs-1:0] valid_all; +logic [NumBufs-1:0] wr_matches_all; +logic [NumBufs-1:0] wr_valid_all; + +// Combined response. +assign matches_o = |matches_all; +assign valid_o = |valid_all; +assign wr_matches_o = |wr_matches_all; + +// A write notification that hits must be serviced immediately and with highest priority; it is +// a single-cycle event that either invalidates the affected buffer or updates its contents. +// +// The parent module is informed of the hit, so that it can hold off any system bus transaction +// occurring in the same cycle, but it does not need to know what action was taken. +// +// Internally we either invalidate the buffer, if there is no valid data available, or we perform +// an update of the buffer contents.
+wire wr_valid = |wr_valid_all; +wire wr_notify_invalidate = wr_notify_i & wr_matches_o & ~wr_valid; +wire wr_notify_update = wr_notify_i & wr_matches_o & wr_valid; + +// Invalidate occurs when matching but not valid, and we need to know which valid signal to consult. +wire [NumBufs-1:0] invalidate = ({NumBufs{invalidate_i}} & matches_all) | + ({NumBufs{wr_notify_invalidate}} & wr_matches_all); +wire [NumBufs-1:0] set = {NumBufs{set_i}} & buf_set; + +// Hit test on buffer contents. +logic [BBIT-1:ABIT] a_offset; +assign a_offset = addr_i[BBIT-1:ABIT]; // Address bits selecting word within burst. + +// Return the bit index of the single bit set within the input; the output is undefined in the +// event of zero bits or more than one bit being set and shall not be used. +// This could only happen as a result of a design fault; it implies multiple buffers holding +// data for the same address. +function automatic logic [Log2MaxBufs-1:0] one_hot_enc(logic [NumBufs-1:0] in); + logic [Log2MaxBufs-1:0] out = 0; + for (int unsigned b = 0; b < Log2MaxBufs; b++) begin + for (int unsigned i = 0; i < NumBufs; i++) out[b] = out[b] | (in[i] & i[b]); + end + return out; +endfunction + +// Offset of write notification address within burst. +logic [BBIT-1:ABIT] wn_offset; +assign wn_offset = wr_notify_addr_i[BBIT-1:ABIT]; // Address bits selecting word within burst. + +// Offset of read or update from within burst; this is either a write notification that hits +// (top priority) or a write transaction that hits. +logic [BBIT-1:ABIT] ur_offset; +assign ur_offset = wr_notify_update ? wn_offset : a_offset; + +// Buffer to be read by the system bus, or updated by the system bus or write notification. +wire [NumBufs-1:0] ur_valid_all = wr_notify_update ? (wr_matches_all & wr_valid_all) : valid_all; +wire [Log2MaxBufs-1:0] ur_buf = one_hot_enc(ur_valid_all); + +// Updating of buffer contents. +wire update = update_i | wr_notify_update; +wire [DBW-1:0] umask = wr_notify_update ? 
wr_notify_mask_i : mask_i; +wire [DW-1:0] udata = wr_notify_update ? wr_notify_data_i : data_i; + +// When receiving data from the HyperRAM controller, it arrives tagged with the buffer number. +// This must be a net: an initializer on a `logic` variable is evaluated only once, at the start +// of simulation, whereas a net declaration assignment tracks `wseq_i` continuously. +localparam int unsigned SeqBits = SeqWidth - PortIDWidth - Log2MaxBufs; +wire [Log2MaxBufs-1:0] wr_buf = wseq_i[SeqBits +: Log2MaxBufs]; +logic [NumBufs-1:0] wr_accepted; + +// Offset at which the next word of returned data shall be written, for each buffer; +// each buffer may have a single outstanding and still current request. +logic [NumBufs-1:0][BBIT-1:ABIT] woffset; + +// Sequence number is driven to non-zero only by the buffer which is about to be (re-)filled, +// so we can just OR all of the sequence numbers together. +logic [NumBufs-1:0][SeqWidth-1:0] seq_all; +always_comb begin + seq_o = 0; + for (int unsigned b = 0; b < NumBufs; b++) seq_o = seq_o | seq_all[b]; +end + +// The read buffer is capable of retaining a number of bursts of read data, so some of the +// control logic - chiefly the address matching - must be replicated for each of these +// internal buffers. +for (genvar b = 0; b < NumBufs; b++) begin : gen_buf_state + // Does the buffer have valid information? + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + configured[b] <= 1'b0; + end else if (invalidate[b] | set[b]) begin + configured[b] <= set[b]; + end + end + + // Updating of validity bits. + always_ff @(posedge clk_i) begin + // System bus-side changes in the buffer status take precedence over newly-received read data. + if (invalidate[b] | set[b]) begin + valid[b] <= 'b0; + end else if (wr_accepted[b]) begin + valid[b][woffset[b]] <= 1'b1; + end + end + + // Indicates that the R/W address matches within this read buffer.
+ assign matches_all[b] = &{configured[b], addr_i[AW-1:BBIT] == base_addr[b][AW-1:BBIT]}; + // Since these validity indicators are combined and returned to the parent as a single indication, + // we must qualify it here with `matches.` + assign valid_all[b] = matches_all[b] & valid[b][a_offset]; + + // Write notification test; this is done in parallel with normal read buffer access because + // normally writes on other TL-UL ports will not collide with buffered read data. + assign wr_matches_all[b] = &{configured[b], + wr_notify_addr_i[AW-1:BBIT] == base_addr[b][AW-1:BBIT]}; + // The per-buffer indicators are ORed together to choose between updating and invalidating, so + // this must be qualified with `wr_matches`; otherwise a valid word at the same offset in an + // unrelated buffer would suppress the invalidation of the buffer that was actually hit. + assign wr_valid_all[b] = wr_matches_all[b] & valid[b][wn_offset]; + + // Sequence number for the buffer contents; it occupies the `SeqBits` LSBs of the full sequence + // number, below the port ID and the buffer number. + logic [SeqBits-1:0] next_seq; + logic [SeqBits-1:0] seq; + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) seq <= '0; + else if (set[b]) seq <= next_seq; + end + // Advance the sequence number for each new fill request; returned data carrying any other + // sequence number is rejected by `wr_accepted` below. + assign next_seq = seq + 'b1; + // When issuing a new burst read, it's the new sequence number that is required, so that we + // accept the returned data. Drive our sequence number to zero if we're not just starting + // a fill request, so that all sequence numbers can simply be ORed together. + assign seq_all[b] = {SeqWidth{set[b]}} & {portid_i, b[Log2MaxBufs-1:0], next_seq}; + + // Base address of buffer contents. + always_ff @(posedge clk_i) begin + if (set[b]) base_addr[b] <= addr_i[AW-1:BBIT]; + end + + // Writes are accepted only if the buffer is still configured and the write data belongs in + // the currently-buffered content. The port ID number has already been checked; only the + // buffer number and the sequence number matter here. + assign wr_accepted[b] = &{write_i, configured[wr_buf], wr_buf == b, + (wseq_i[SeqBits-1:0] == seq)}; + + // Writing into the buffer.
+ always_ff @(posedge clk_i) begin + if (set[b]) begin + // Retain the offset of the first word that will be returned by the wrapping burst. + woffset[b] <= addr_i[BBIT-1:ABIT]; + end else begin + // Wrapping bursts are achieved by `woffset` overflowing at the end of the burst. + woffset[b] <= woffset[b] + {{(BBIT-ABIT-1){1'b0}}, wr_accepted[b]}; + end + end +end + +// RAM submodule wants a single write strobe per data line. +localparam int unsigned DataBitsPerMask = DW / DBW; +logic [DW-1:0] umask_full; +always_comb begin + for (int unsigned b = 0; b < DBW; b++) begin + umask_full[b*DataBitsPerMask +: DataBitsPerMask] = {DataBitsPerMask{umask[b]}}; + end +end + +// Use a dual-port implementation for simplicity because the design is targeting an FPGA +// implementation. Read-write collisions will be infrequent but we DO need to handle them. +prim_ram_2p #( + .Width (DW), + .Depth (NumBufs * BufWords), + .DataBitsPerMask (DataBitsPerMask) +) u_buf( + .clk_a_i (clk_i), + .clk_b_i (clk_i), + + // Read/update port (TL-UL side). + // - update and read shall not occur simultaneously; let update take precedence. + .a_req_i (update | read_i), + .a_write_i (update), + .a_addr_i ({ur_buf, ur_offset}), + .a_wdata_i (udata), + .a_wmask_i (umask_full), + .a_rdata_o (rdata_o), + + // Write port (HyperRAM side). + .b_req_i (write_i), + .b_write_i (1'b1), + .b_addr_i ({wr_buf, woffset[wr_buf]}), + .b_wdata_i (wdata_i), + .b_wmask_i ('1), + .b_rdata_o (), // Write-only port. + + .cfg_i ('0) +); + +logic unused; +assign unused = ^wseq_i; // The port number has already been checked in the parent. + +endmodule + diff --git a/rtl/ip/hyperram/rtl/hyperram_wrbuf.sv b/rtl/ip/hyperram/rtl/hyperram_wrbuf.sv new file mode 100644 index 000000000..3d73e6b8a --- /dev/null +++ b/rtl/ip/hyperram/rtl/hyperram_wrbuf.sv @@ -0,0 +1,277 @@ +// Copyright lowRISC contributors. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Write buffering collects single-word write transactions to coalesce them +// into larger bursts when possible. +// +// Basic design rules: +// +// 1. Contiguous ascending writes (including bytes and half words) are collected +// until we have reached the maximum burst length. +// 2. Up to two words of contiguous descending writes may be collected; this +// limitation is because we have only a single word of internal storage. +// 3. Reads may overtake an under-construction burst write, with the proviso +// that they cannot collide with the write data that is currently held. +// 4. Write data is NOT held indefinitely; a timer mechanism will flush out +// the write data if the burst has not been extended with further data. +// This is primarily to improve coherency with other ports of the HyperRAM, +// but it also reduces the probability of subsequent read operations being +// delayed. + +module hyperram_wrbuf #( + parameter int unsigned AW = 20, // Width of address, bits. + parameter int unsigned DW = 32, // Width of data, bits. + parameter int unsigned DBW = DW / 8, // Number of write strobes. + parameter int unsigned SeqWidth = 4, // Width of sequence number, bits. + parameter int unsigned Log2BurstLen = 5, // Maximum of 32 bytes/burst. + // Are reads permitted to overtake writes when there is no address collision? + parameter bit ReadsOvertakeWrites = 1'b1, + + // LSB of word address. + localparam int unsigned ABIT = $clog2(DW / 8) +) ( + input clk_i, + input rst_ni, + + // Data to be written to the Downstream FIFO. + output dfifo_wr_full_o, + input [DBW-1:0] dfifo_wr_strb_i, + input [DW-1:0] dfifo_wr_din_i, + + // Input command requests from the TL-UL port. + input cmd_req_i, + output cmd_wready_o, + input [AW-1:ABIT] cmd_mem_addr_i, + input [Log2BurstLen-ABIT:0] cmd_word_cnt_i, + input cmd_wr_not_rd_i, + input [SeqWidth-1:0] cmd_seq_i, + + // Modified write traffic to the Downstream FIFO. 
+ output dfifo_wr_ena_o, + input dfifo_wr_full_i, + output [DBW-1:0] dfifo_wr_strb_o, + output [DW-1:0] dfifo_wr_din_o, + + // Modified command requests to the HyperRAM controller. + output cmd_req_o, + input cmd_wready_i, + output [AW-1:ABIT] cmd_mem_addr_o, + output [Log2BurstLen-ABIT:0] cmd_word_cnt_o, + output cmd_wr_not_rd_o, + output cmd_wrap_not_incr_o, + output [SeqWidth-1:0] cmd_seq_o +); + +// This may at some point become a full write buffer with the ability to coalesce +// a number of TL-UL write transactions that form a (nearly-)contiguous block of +// data but are received out of order. +// +// For now it addresses the simple cases that may be handled without the need for +// snooping of buffered write data. This provides significant performance benefit +// for the common case of contiguous ascending and - less so - descending word writes. +// +// A single word of write data is held internally before it is committed to the +// Downstream FIFO to the HBMC. Obviously data that has been committed cannot be +// retrieved/modified by this logic. Committing the data words to the FIFO +// before determining the burst length and issuing the write command is only possible +// because _just one_ port (the LSU) implements write coalescing in this manner. + +localparam int unsigned BBIT = Log2BurstLen; + +// Number of bits in the write timeout counter. +localparam int unsigned TimerW = 5; + +// Is there a write transaction in this cycle? +wire wr_req = cmd_req_i & cmd_wr_not_rd_i; +// How about a read? +wire rd_req = cmd_req_i & ~cmd_wr_not_rd_i; + +// Retained burst details. +logic wr_stored; +logic [TimerW-1:0] wr_timer; +logic [AW-1:ABIT] base_addr_stored; +logic [Log2BurstLen-1-ABIT:0] burst_len_m1; // Bus words minus 1. +// Expectations about next write transaction. +logic [AW-1:ABIT] exp_addr; + +// Retained write strobes and data.
+logic [DBW-1:0] strb_stored; +logic [DW-1:0] data_stored; + +// Address of the word immediately above the current transaction; we can use this to check +// contiguity for both ascending and descending accesses. +wire [AW-1:ABIT] next_addr = cmd_mem_addr_i + 'b1; + +// Is the new write contiguously above the previous write? +wire contig_above = (cmd_mem_addr_i == exp_addr); +// How about descending? +// Note: we can only accept a word that precedes the start when we have seen just one earlier word. +wire contig_below = (next_addr == base_addr_stored) & ~|burst_len_m1; +// Same address as previous write transaction? e.g. partial writes. There shouldn't really be +// any repeated full word stores to a single address, but we can handle them inexpensively. +wire addr_repeated = (next_addr == exp_addr); +// Can we coalesce a write transaction with an under-construction burst write? +// +// Note: this considers only the current transaction type, address and strobes; further +// qualification with the burst length may be required in the use of `coalesce`. +wire coalesce = &{wr_req, contig_above | contig_below | addr_repeated}; + +// Increment the burst length; we also use this logic when writing out the `cmd_word_cnt` because +// the HyperRAM controller expects a 1-based value and it's preferable to perform the increment +// here, on the lower clock frequency. +wire [Log2BurstLen-ABIT:0] next_len = burst_len_m1 + 'b1; + +// Command FIFO is preventing progress? +logic cmd_stalled; +// Downstream FIFO is preventing progress? +logic dfifo_stalled; +// Is state prevented from advancing? +wire stalled = cmd_stalled | dfifo_stalled; + +// Read collision with current write burst? +// Note: the burst writes are linear, not wrapping, so it does not suffice to assume that the upper +// address bits are the same for all words within the burst.
+// +// Note: this is a bit conservative to make the check less expensive; we could compare against the +// present length of the under-construction write burst, but this inexpensive test will catch the +// vast majority of cases, allowing reads to proceed without needlessly terminating the write burst. +wire addr_collision = wr_stored & + ((cmd_mem_addr_i[AW-1:BBIT] == base_addr_stored[AW-1:BBIT]) || + (cmd_mem_addr_i[AW-1:BBIT] == exp_addr[AW-1:BBIT]) || + !ReadsOvertakeWrites); // Treat all reads as collisions? +wire rd_collision = rd_req & addr_collision; + +// By reordering a pair of word writes we can coalesce writes to descending addresses; +// to achieve longer burst writes in this case would require a LIFO implementation since the +// HBMC and HyperRAM accept only ascending bursts. +logic wr_descending_del; + +// Maximum length burst; word count is maximum and the final word is complete. +wire burst_max = &{burst_len_m1, strb_stored}; + +// Flush out the current write burst because the present transaction cannot be combined with it; +// this requires writing to the Command FIFO, and being sure to do so no later than the final +// data word is written into the Downstream FIFO. +wire wr_timeout = ~|wr_timer; +wire flush_write = wr_stored & |{wr_req & !coalesce, // Write cannot be combined. + wr_descending_del, // Can only coalesce two descending words. + burst_max, // Maximum burst length reached. + rd_collision, // Read may collide with write burst. + wr_timeout}; // Write data too old. + +// Store this write in anticipation of constructing a burst write? +wire wr_start = wr_req & (!wr_stored | flush_write); + +always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + wr_stored <= 1'b0; + end else if ((wr_start | flush_write) & !stalled) begin + // We can start a new burst collection in the same cycle as flushing out the current one. + wr_stored <= wr_start; + end +end + +// Is this the second (and final) word of a descending write?
+wire wr_descending = &{wr_stored, wr_req, contig_below}; + +// After we have spotted a write transaction to a descending address, we must be sure to write out +// the initial word and not continue collecting; otherwise data would be lost. +always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) wr_descending_del <= 1'b0; + else if (!stalled) wr_descending_del <= wr_descending; +end + +// Most write transactions are stored (delayed by at least one cycle to see whether coalescing is +// possible). The exception is when we already have one word stored and the current write +// transaction immediately precedes it, so we must reorder the words when writing them into the +// Downstream FIFO. +wire wr_storing = wr_req & ~(wr_stored & contig_below); + +// Do not hang onto write data indefinitely. +// +// This both prevents a subsequent read collision suffering additional delay and reduces the +// likelihood of coherency problems caused by writing code into the HyperRAM via the LSU Data port +// and then reading it back via the Instruction Fetch port. +always_ff @(posedge clk_i) begin + if (!stalled) begin + if (wr_start | wr_storing) wr_timer <= {TimerW{1'b1}}; + else if (wr_stored) wr_timer <= wr_timer - 'b1; + end +end + +// Burst properties; address and burst length tracking. +always_ff @(posedge clk_i) begin + if (!stalled) begin + if (wr_start) begin + // First word of a new burst. + base_addr_stored <= cmd_mem_addr_i; + burst_len_m1 <= 'h0; + // Expected address of next write transaction. + exp_addr <= next_addr; + end else if (wr_req) begin + // Contiguous ascending burst is the most common case. + if (contig_above) exp_addr <= next_addr; + // We can coalesce only two words for a descending burst. + if (contig_below) base_addr_stored <= cmd_mem_addr_i; + // Increment the burst length when a new word-aligned address is observed. 
+ if (contig_above | contig_below) begin + burst_len_m1 <= next_len[Log2BurstLen-1-ABIT:0]; + end + end + end +end + +// We must stall if we need to send a command but the command FIFO is unavailable +// (this could be because it's unavailable or simply because we have not yet won arbitration). +assign cmd_stalled = cmd_req_o & !cmd_wready_i; + +// We can store only a single word of strobes/data locally before writing into the Downstream FIFO. +// If we spot a descending write then we must reorder the two words, writing out the second word +// immediately and its predecessor in the next stall-free cycle. +wire dfifo_wr_ena = flush_write | wr_descending | &{wr_stored, wr_storing, !addr_repeated}; +// We must stall the sender and our internal logic if we cannot proceed with a data write. +assign dfifo_stalled = dfifo_wr_ena & dfifo_wr_full_i; + +// Merge the write strobes when we receive a subsequent partial write to the same address. +wire merge_strobes = addr_repeated & ~wr_start; + +// Capturing of burst strobes/data. We support partial word writes, collecting the strobes and +// the data bytes. +always_ff @(posedge clk_i) begin + if (wr_storing & !stalled) begin + strb_stored <= dfifo_wr_strb_i | (strb_stored & {DBW{merge_strobes}}); + for (int unsigned b = 0; b < DBW; b++) begin + if (dfifo_wr_strb_i[b]) begin + data_stored[b*8 +: 8] <= dfifo_wr_din_i[b*8 +: 8]; + end + end + end +end + +// Write data out to the Downstream FIFO. Usually we're writing the stored values, but in the +// event of a descending write, we must reorder the two writes. +assign dfifo_wr_ena_o = dfifo_wr_ena & !cmd_stalled; +assign dfifo_wr_strb_o = wr_descending ? dfifo_wr_strb_i : strb_stored; +assign dfifo_wr_din_o = wr_descending ? dfifo_wr_din_i : data_stored; + +// Modified command traffic to the HyperRAM controller. +assign cmd_req_o = (flush_write & !dfifo_wr_full_i) | (rd_req & ~addr_collision); +assign cmd_mem_addr_o = (flush_write & !wr_descending) ? 
base_addr_stored : cmd_mem_addr_i; +assign cmd_word_cnt_o = flush_write ? next_len : cmd_word_cnt_i; +assign cmd_wr_not_rd_o = flush_write; +// Writes are linear, reads are wrapping. +assign cmd_wrap_not_incr_o = !flush_write; +// Sequence number applies only to read requests; it is simply returned in the Upstream FIFO. +assign cmd_seq_o = cmd_seq_i; + +// Stall the sender if we cannot accept the current transaction. +assign cmd_wready_o = ~|{rd_req & flush_write, // Must complete the write before we can read. + cmd_req_i & stalled}; // Cannot proceed when stalled. +// If we need to write a data word but the Downstream FIFO is full that implies that a previous +// command is still emptying the FIFO and - if necessary - we will stall the sender using +// `cmd_wready_o`. +assign dfifo_wr_full_o = 1'b0; + +endmodule + diff --git a/rtl/system/sonata_system.sv b/rtl/system/sonata_system.sv index c3b3c6992..793760203 100644 --- a/rtl/system/sonata_system.sv +++ b/rtl/system/sonata_system.sv @@ -298,10 +298,8 @@ module sonata_system tlul_pkg::tl_d2h_t tl_sram_a_d2h; tlul_pkg::tl_h2d_t tl_sram_b_h2d; tlul_pkg::tl_d2h_t tl_sram_b_d2h; - tlul_pkg::tl_h2d_t tl_hyperram_us_h2d[2]; - tlul_pkg::tl_d2h_t tl_hyperram_us_d2h[2]; - tlul_pkg::tl_h2d_t tl_hyperram_ds_h2d; - tlul_pkg::tl_d2h_t tl_hyperram_ds_d2h; + tlul_pkg::tl_h2d_t tl_hyperram_h2d[2]; + tlul_pkg::tl_d2h_t tl_hyperram_d2h[2]; tlul_pkg::tl_h2d_t tl_gpio_h2d; tlul_pkg::tl_d2h_t tl_gpio_d2h; tlul_pkg::tl_h2d_t tl_xadc_h2d; @@ -355,8 +353,8 @@ module sonata_system // Device interfaces. .tl_sram_o (tl_sram_a_h2d), .tl_sram_i (tl_sram_a_d2h), - .tl_hyperram_o (tl_hyperram_us_h2d[0]), - .tl_hyperram_i (tl_hyperram_us_d2h[0]), + .tl_hyperram_o (tl_hyperram_h2d[0]), + .tl_hyperram_i (tl_hyperram_d2h[0]), .tl_rev_tag_o (tl_rev_tag_h2d), .tl_rev_tag_i (tl_rev_tag_d2h), .tl_gpio_o (tl_gpio_h2d), @@ -405,8 +403,8 @@ module sonata_system // Devices.
.tl_sram_o (tl_sram_b_h2d), .tl_sram_i (tl_sram_b_d2h), - .tl_hyperram_o (tl_hyperram_us_h2d[1]), - .tl_hyperram_i (tl_hyperram_us_d2h[1]), + .tl_hyperram_o (tl_hyperram_h2d[1]), + .tl_hyperram_i (tl_hyperram_d2h[1]), .tl_dbg_dev_o (tl_dbg_dev_us_h2d[0]), .tl_dbg_dev_i (tl_dbg_dev_us_d2h[0]), @@ -505,27 +503,25 @@ module sonata_system .tl_b_o (tl_sram_b_d2h) ); - // HyperRAM `ifdef TARGET_XL_BOARD - // No HyperRAM on Sonata XL, but we can replace it with internal block RAM - sram #( - .AddrWidth ( $clog2(HyperRAMSize) ), - .DataWidth ( BusDataWidth ), - .DataBitsPerMask ( DataBitsPerMask ), - .InitFile () - ) u_hyperram ( - .clk_i (clk_sys_i), - .rst_ni (rst_sys_ni), + // No HyperRAM on Sonata XL, so the build requires USE_HYPERRAM_SRAM_MODEL + wire [7:0] hyperram_dq; + wire hyperram_rwds; + wire hyperram_ckp; + wire hyperram_ckn; + wire hyperram_nrst; + wire hyperram_cs; + wire clk_hr_i = 1'b0; + wire clk_hr90p_i = 1'b0; + wire clk_hr3x_i = 1'b0; + wire rst_hr_ni = 1'b0; +`endif - .tl_a_i (tl_hyperram_ds_h2d), - .tl_a_o (tl_hyperram_ds_d2h), - .tl_b_i (), - .tl_b_o () - ); -`else + // HyperRAM hyperram #( .HyperRAMClkFreq ( HyperRAMClkFreq ), - .HyperRAMSize ( HyperRAMSize ) + .HyperRAMSize ( HyperRAMSize ), + .NumPorts ( 2 ) ) u_hyperram ( .clk_i (clk_sys_i), .rst_ni (rst_sys_ni), @@ -535,8 +531,8 @@ module sonata_system .clk_hr3x_i, .rst_hr_ni, - .tl_i (tl_hyperram_ds_h2d), - .tl_o (tl_hyperram_ds_d2h), + .tl_i (tl_hyperram_h2d), + .tl_o (tl_hyperram_d2h), .hyperram_dq, .hyperram_rwds, @@ -545,31 +541,6 @@ module sonata_system .hyperram_nrst, .hyperram_cs ); -`endif - - // Manual M:1 socket instantiation as xbar generator cannot deal with multiple ports for one - // device and we want to utilize the dual port SRAM. So totally separate crossbars are generated - // for the dside and iside then tlul_socket_m1 is used here to connect the two crossbars to the - // one downstream hyperram tilelink port. 
- // - // US == Upstream - // DS == Downstream - // - // US is the Ibex/Host end, DS is the Hyperram end. - tlul_socket_m1 #( - .HReqDepth (8'h0), - .HRspDepth (8'h0), - .DReqDepth (4'h0), - .DRspDepth (4'h0), - .M (2) - ) u_hyperram_tl_socket ( - .clk_i (clk_sys_i), - .rst_ni(rst_sys_ni), - .tl_h_i(tl_hyperram_us_h2d), - .tl_h_o(tl_hyperram_us_d2h), - .tl_d_o(tl_hyperram_ds_h2d), - .tl_d_i(tl_hyperram_ds_d2h) - ); tlul_socket_m1 #( .HReqDepth (8'h0), @@ -1330,4 +1301,9 @@ module sonata_system logic _unused_tsaddr; assign _unused_tsaddr = |tsmap_addr[TsMapAddrWidth-1:RevTagAddrWidth]; + +`ifdef TARGET_XL_BOARD + logic unused_hr; + assign unused_hr = ^{hyperram_ckp, hyperram_ckn, hyperram_cs, hyperram_nrst}; +`endif endmodule diff --git a/sonata.core b/sonata.core index 749f0aac3..ab20a66b4 100644 --- a/sonata.core +++ b/sonata.core @@ -110,10 +110,10 @@ parameters: paramtype: vlogdefine description: Primitives implementation to use, e.g. "prim_pkg::ImplGeneric". - USE_HYPERRAM_SIM_MODEL: + USE_HYPERRAM_SRAM_MODEL: datatype: bool paramtype: vlogdefine - description: Use an SRAM simulation model rather than the real hyperram controller + description: Use an SRAM implementation rather than the real HyperRAM controller TARGET_XL_BOARD: datatype: bool @@ -154,6 +154,7 @@ targets: parameters: - SRAMInitFile - PRIM_DEFAULT_IMPL=prim_pkg::ImplXilinx + - USE_HYPERRAM_SRAM_MODEL=true - TARGET_XL_BOARD=true sim: @@ -173,7 +174,7 @@ targets: - '--trace-structs' - '--trace-params' - '--trace-max-array 1024' - - '-CFLAGS "-Wall -DVM_TRACE_FMT_FST -DTOPLEVEL_NAME=top_verilator -DUSE_HYPERRAM_SIM_MODEL"' + - '-CFLAGS "-Wall -DVM_TRACE_FMT_FST -DTOPLEVEL_NAME=top_verilator -DUSE_HYPERRAM_SRAM_MODEL"' # Add "-DUSE_SEPARATED_CLOCKS" to CFLAGS for a more accurate simulation. 
- '-LDFLAGS "-pthread -lutil -lelf"' - "-Wall" @@ -184,7 +185,7 @@ targets: parameters: - USE_SEPARATED_CLOCKS=false - PRIM_DEFAULT_IMPL=prim_pkg::ImplGeneric - - USE_HYPERRAM_SIM_MODEL=true + - USE_HYPERRAM_SRAM_MODEL=true lint: <<: *default_target @@ -199,4 +200,4 @@ targets: - PRIM_DEFAULT_IMPL=prim_pkg::ImplGeneric # TODO: Introduce some blackboxes for the Xilinx IP used in the hyperram # controller so we can lint it, for now just exclude it from the lint run. - - USE_HYPERRAM_SIM_MODEL=true + - USE_HYPERRAM_SRAM_MODEL=true