Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 93 additions & 4 deletions src/mem/DRAMInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,18 @@ class DRAMInterface(MemInterface):
# timing behaviour and constraints - all in nanoseconds

# the amount of time in nanoseconds from issuing an activate command
# to the data being available in the row buffer for a read/write
tRCD = Param.Latency("RAS to CAS delay")
# to the data being available in the row buffer for a read
tRCD = Param.Latency("RAS to Read CAS delay")

# the time from issuing a read/write command to seeing the actual data
tCL = Param.Latency("CAS latency")
# the amount of time in nanoseconds from issuing an activate command
# to the data being available in the row buffer for a write
tRCD_WR = Param.Latency(Self.tRCD, "RAS to Write CAS delay")

# the time from issuing a read command to seeing the actual data
tCL = Param.Latency("Read CAS latency")

# the time from issuing a write command to seeing the actual data
tCWL = Param.Latency(Self.tCL, "Write CAS latency")

# minimum time between a precharge and subsequent activate
tRP = Param.Latency("Row precharge time")
Expand Down Expand Up @@ -1145,6 +1152,88 @@ class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
# self refresh exit time
tXS = '65ns'

# A single HBM2 x64 interface (tested with HBMCtrl in gem5)
# to be used as a single pseudo channel. The timings are based
# on HBM gen2 specifications. 4H stack, 8Gb per die and total capacity
# of 4GiB.

class HBM_2000_4H_1x64(DRAMInterface):

    # 64-bit interface for a single pseudo channel
    device_bus_width = 64

    # HBM2 supports BL4
    burst_length = 4

    # size of channel in bytes, 4H stack of 8Gb dies is 4GiB per stack;
    # with 16 pseudo channels, 256MiB per pseudo channel
    device_size = "256MiB"

    device_rowbuffer_size = "1KiB"

    # 1x64 configuration: one x64 device per rank
    devices_per_rank = 1

    ranks_per_channel = 1

    # 16 banks arranged in 4 bank groups
    banks_per_rank = 16
    bank_groups_per_rank = 4

    # 1000 MHz for 2Gbps DDR data rate
    tCK = "1ns"

    tRP = "14ns"

    tCCD_L = "3ns"

    # activate-to-read CAS delay; writes use the shorter tRCD_WR
    tRCD = "12ns"
    tRCD_WR = "6ns"
    tCL = "18ns"
    tCWL = "7ns"
    tRAS = "28ns"

    # BL4 in pseudo channel mode
    # DDR @ 1000 MHz means 4 * 1ns / 2 = 2ns
    tBURST = "2ns"

    # value for 2Gb device from JEDEC spec
    tRFC = "220ns"

    # value for 2Gb device from JEDEC spec
    tREFI = "3.9us"

    tWR = "14ns"
    tRTP = "5ns"
    tWTR = "4ns"
    tWTR_L = "9ns"
    tRTW = "18ns"

    # tAAD from RBus
    tAAD = "1ns"

    # single rank device, set to 0
    tCS = "0ns"

    tRRD = "4ns"
    tRRD_L = "6ns"

    # for a single pseudo channel
    tXAW = "16ns"
    activation_limit = 4

    # 4tCK
    tXP = "8ns"

    # self-refresh exit time
    # NOTE(review): the original comment derived this as tRFC + tXP
    # ("160ns + 8ns = 168ns"), which matches neither tRFC above (220ns)
    # nor the 216ns value used here — confirm against the JEDEC HBM2 spec
    tXS = "216ns"

    page_policy = 'close_adaptive'

    read_buffer_size = 64
    write_buffer_size = 64

    two_cycle_activate = True

# A single LPDDR5 x16 interface (one command/address bus)
# for a single x16 channel with default timings based on
# initial JEDEC specification
Expand Down
50 changes: 50 additions & 0 deletions src/mem/HBMCtrl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (c) 2022 The Regents of the University of California
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from m5.params import *
from m5.proxy import *
from m5.objects.SimpleMemCtrl import *
from m5.objects.DRAMInterface import HBM_2000_4H_1x64

# HBMCtrl manages two pseudo channels of HBM2

class HBMCtrl(SimpleMemCtrl):
    type = 'HBMCtrl'
    cxx_header = "mem/hbm_ctrl.hh"
    cxx_class = 'gem5::memory::HBMCtrl'

    # HBMCtrl uses the SimpleMemCtrl's interface
    # `dram` as the first pseudo channel; the second
    # pseudo channel interface follows here.
    # HBMCtrl has been tested with two HBM_2000_4H_1x64 interfaces
    dram_2 = Param.DRAMInterface(HBM_2000_4H_1x64(), "Memory interface")

    # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaces
    # gives the best results with the following min_r/w_per_switch
    min_reads_per_switch = 64
    min_writes_per_switch = 64

    # if True, the controller keeps separate read/write queues
    # per pseudo channel instead of one shared pair
    partitioned_q = Param.Bool(True, "split queues for pseudo channels")
2 changes: 2 additions & 0 deletions src/mem/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ DebugFlag('SysBridge')
SimObject('SimpleMemCtrl.py', sim_objects=['SimpleMemCtrl'],
enums=['MemSched'])
SimObject('MemCtrl.py', sim_objects=['MemCtrl'])
SimObject('HBMCtrl.py', sim_objects=['HBMCtrl'])
SimObject('MemInterface.py', sim_objects=['MemInterface'], enums=['AddrMap'])
SimObject('DRAMInterface.py', sim_objects=['DRAMInterface'],
enums=['PageManage'])
Expand All @@ -77,6 +78,7 @@ Source('external_master.cc')
Source('external_slave.cc')
Source('simple_mem_ctrl.cc')
Source('mem_ctrl.cc')
Source('hbm_ctrl.cc')
Source('mem_interface.cc')
Source('dram_interface.cc')
Source('nvm_interface.cc')
Expand Down
4 changes: 4 additions & 0 deletions src/mem/SimpleMemCtrl.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ class SimpleMemCtrl(QoSMemCtrl):
min_writes_per_switch = Param.Unsigned(16, "Minimum write bursts before "
"switching to reads")

# minimum read bursts to schedule before switching back to writes
min_reads_per_switch = Param.Unsigned(16, "Minimum read bursts before "
"switching to writes")

# scheduler, address map and page policy
mem_sched_policy = Param.MemSched('frfcfs', "Memory scheduling policy")

Expand Down
66 changes: 43 additions & 23 deletions src/mem/dram_interface.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ DRAMInterface::chooseNextFRFCFS(MemPacketQueue& queue, Tick min_col_at) const
MemPacket* pkt = *i;

// select optimal DRAM packet in Q
if (pkt->isDram()) {
if (pkt->isDram() && (pkt->pseudoChannel == pseudoChannel)) {
const Bank& bank = ranks[pkt->rank]->banks[pkt->bank];
const Tick col_allowed_at = pkt->isRead() ? bank.rdAllowedAt :
bank.wrAllowedAt;
Expand Down Expand Up @@ -183,7 +183,7 @@ DRAMInterface::activateBank(Rank& rank_ref, Bank& bank_ref,
if (twoCycleActivate)
act_at = ctrl->verifyMultiCmd(act_tick, maxCommandsPerWindow, tAAD);
else
act_at = ctrl->verifySingleCmd(act_tick, maxCommandsPerWindow);
act_at = ctrl->verifySingleCmd(act_tick, maxCommandsPerWindow, true);

DPRINTF(DRAM, "Activate at tick %d\n", act_at);

Expand Down Expand Up @@ -214,8 +214,9 @@ DRAMInterface::activateBank(Rank& rank_ref, Bank& bank_ref,
bank_ref.preAllowedAt = act_at + tRAS;

// Respect the row-to-column command delay for both read and write cmds
bank_ref.rdAllowedAt = std::max(act_at + tRCD, bank_ref.rdAllowedAt);
bank_ref.wrAllowedAt = std::max(act_at + tRCD, bank_ref.wrAllowedAt);
bank_ref.rdAllowedAt = std::max(act_at + tRCD_RD, bank_ref.rdAllowedAt);
bank_ref.wrAllowedAt = std::max(act_at + tRCD_WR, bank_ref.wrAllowedAt);


// start by enforcing tRRD
for (int i = 0; i < banksPerRank; i++) {
Expand Down Expand Up @@ -301,7 +302,7 @@ DRAMInterface::prechargeBank(Rank& rank_ref, Bank& bank, Tick pre_tick,
// Issuing an explicit PRE command
// Verify that we have command bandwidth to issue the precharge
// if not, shift to next burst window
pre_at = ctrl->verifySingleCmd(pre_tick, maxCommandsPerWindow);
pre_at = ctrl->verifySingleCmd(pre_tick, maxCommandsPerWindow, true);
// enforce tPPD
for (int i = 0; i < banksPerRank; i++) {
rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD,
Expand Down Expand Up @@ -399,10 +400,11 @@ DRAMInterface::doBurstAccess(MemPacket* mem_pkt, Tick next_burst_at,

// verify that we have command bandwidth to issue the burst
// if not, shift to next burst window
if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay))
Tick max_sync = clkResyncDelay + (mem_pkt->isRead() ? tRL : tWL);
if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > max_sync))
cmd_at = ctrl->verifyMultiCmd(cmd_at, maxCommandsPerWindow, tCK);
else
cmd_at = ctrl->verifySingleCmd(cmd_at, maxCommandsPerWindow);
cmd_at = ctrl->verifySingleCmd(cmd_at, maxCommandsPerWindow, false);

// if we are interleaving bursts, ensure that
// 1) we don't double interleave on next burst issue
Expand All @@ -423,7 +425,11 @@ DRAMInterface::doBurstAccess(MemPacket* mem_pkt, Tick next_burst_at,
DPRINTF(DRAM, "Schedule RD/WR burst at tick %d\n", cmd_at);

// update the packet ready time
mem_pkt->readyTime = cmd_at + tCL + tBURST;
if (mem_pkt->isRead()) {
mem_pkt->readyTime = cmd_at + tRL + tBURST;
} else {
mem_pkt->readyTime = cmd_at + tWL + tBURST;
}

rank_ref.lastBurstTick = cmd_at;

Expand Down Expand Up @@ -513,6 +519,13 @@ DRAMInterface::doBurstAccess(MemPacket* mem_pkt, Tick next_burst_at,
// 3) make sure we are not considering the packet that we are
// currently dealing with
while (!got_more_hits && p != queue[i].end()) {

if ((*p)->pseudoChannel != pseudoChannel) {
// only consider if this pkt belongs to this interface
++p;
continue;
}

if (mem_pkt != (*p)) {
bool same_rank_bank = (mem_pkt->rank == (*p)->rank) &&
(mem_pkt->bank == (*p)->bank);
Expand Down Expand Up @@ -625,19 +638,21 @@ DRAMInterface::DRAMInterface(const DRAMInterfaceParams &_p)
: MemInterface(_p),
bankGroupsPerRank(_p.bank_groups_per_rank),
bankGroupArch(_p.bank_groups_per_rank > 0),
tCL(_p.tCL),
tRL(_p.tCL),
tWL(_p.tCWL),
tBURST_MIN(_p.tBURST_MIN), tBURST_MAX(_p.tBURST_MAX),
tCCD_L_WR(_p.tCCD_L_WR), tCCD_L(_p.tCCD_L), tRCD(_p.tRCD),
tCCD_L_WR(_p.tCCD_L_WR), tCCD_L(_p.tCCD_L),
tRCD_RD(_p.tRCD), tRCD_WR(_p.tRCD_WR),
tRP(_p.tRP), tRAS(_p.tRAS), tWR(_p.tWR), tRTP(_p.tRTP),
tRFC(_p.tRFC), tREFI(_p.tREFI), tRRD(_p.tRRD), tRRD_L(_p.tRRD_L),
tPPD(_p.tPPD), tAAD(_p.tAAD),
tXAW(_p.tXAW), tXP(_p.tXP), tXS(_p.tXS),
clkResyncDelay(tCL + _p.tBURST_MAX),
clkResyncDelay(_p.tBURST_MAX),
dataClockSync(_p.data_clock_sync),
burstInterleave(tBURST != tBURST_MIN),
twoCycleActivate(_p.two_cycle_activate),
activationLimit(_p.activation_limit),
wrToRdDlySameBG(tCL + _p.tBURST_MAX + _p.tWTR_L),
wrToRdDlySameBG(tWL + _p.tBURST_MAX + _p.tWTR_L),
rdToWrDlySameBG(_p.tRTW + _p.tBURST_MAX),
pageMgmt(_p.page_policy),
maxAccessesPerRow(_p.max_accesses_per_row),
Expand Down Expand Up @@ -819,7 +834,7 @@ DRAMInterface::isBusy(bool read_queue_empty, bool all_writes_nvm)

MemPacket*
DRAMInterface::decodePacket(const PacketPtr pkt, Addr pkt_addr,
unsigned size, bool is_read)
unsigned size, bool is_read, uint8_t pseudo_channel)
{
// decode the address based on the address mapping scheme, with
// Ro, Ra, Co, Ba and Ch denoting row, rank, column, bank and
Expand Down Expand Up @@ -899,8 +914,8 @@ DRAMInterface::decodePacket(const PacketPtr pkt, Addr pkt_addr,
// later
uint16_t bank_id = banksPerRank * rank + bank;

return new MemPacket(pkt, is_read, true, rank, bank, row, bank_id,
pkt_addr, size);
return new MemPacket(pkt, is_read, true, pseudo_channel, rank, bank, row,
bank_id, pkt_addr, size);
}

void DRAMInterface::setupRank(const uint8_t rank, const bool is_read)
Expand Down Expand Up @@ -1017,10 +1032,6 @@ DRAMInterface::minBankPrep(const MemPacketQueue& queue,
Tick min_act_at = MaxTick;
std::vector<uint32_t> bank_mask(ranksPerChannel, 0);

// latest Tick for which ACT can occur without incurring additional
// delay on the data bus
const Tick hidden_act_max = std::max(min_col_at - tRCD, curTick());

// Flag condition when burst can issue back-to-back with previous burst
bool found_seamless_bank = false;

Expand All @@ -1032,6 +1043,9 @@ DRAMInterface::minBankPrep(const MemPacketQueue& queue,
// bank in question
std::vector<bool> got_waiting(ranksPerChannel * banksPerRank, false);
for (const auto& p : queue) {
if (p->pseudoChannel != pseudoChannel)
continue;

if (p->isDram() && ranks[p->rank]->inRefIdleState())
got_waiting[p->bankId] = true;
}
Expand All @@ -1054,6 +1068,13 @@ DRAMInterface::minBankPrep(const MemPacketQueue& queue,
std::max(ranks[i]->banks[j].actAllowedAt, curTick()) :
std::max(ranks[i]->banks[j].preAllowedAt, curTick()) + tRP;

// latest Tick for which ACT can occur without
// incurring additional delay on the data bus
const Tick tRCD = ctrl->inReadBusState(false) ?
tRCD_RD : tRCD_WR;
const Tick hidden_act_max =
std::max(min_col_at - tRCD, curTick());

// When is the earliest the R/W burst can issue?
const Tick col_allowed_at = ctrl->inReadBusState(false) ?
ranks[i]->banks[j].rdAllowedAt :
Expand Down Expand Up @@ -1288,11 +1309,10 @@ DRAMInterface::Rank::processRefreshEvent()
// if a request is at the moment being handled and this request is
// accessing the current rank then wait for it to finish
if ((rank == dram.activeRank)
&& (dram.ctrl->requestEventScheduled())) {
&& (dram.ctrl->requestEventScheduled(dram.pseudoChannel))) {
// hand control over to the request loop until it is
// evaluated next
DPRINTF(DRAM, "Refresh awaiting draining\n");

return;
} else {
refreshState = REF_PD_EXIT;
Expand Down Expand Up @@ -1649,10 +1669,10 @@ DRAMInterface::Rank::processPowerEvent()
}

// completed refresh event, ensure next request is scheduled
if (!dram.ctrl->requestEventScheduled()) {
if (!(dram.ctrl->requestEventScheduled(dram.pseudoChannel))) {
DPRINTF(DRAM, "Scheduling next request after refreshing"
" rank %d\n", rank);
dram.ctrl->restartScheduler(curTick());
dram.ctrl->restartScheduler(curTick(), dram.pseudoChannel);
}
}

Expand Down
Loading