L1D read hit: decrement prediction_table[hashed_pc] (saturating at 0)

cwpeng0511 · cwpeng0511 · commit f582165a9898 · 2025-11-19T18:09:15.000+08:00
diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc
@@ -332,6 +332,51 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx,
   return MISS;
 }
 
+/// cwpeng: stamp `hashed_pc` onto every line of addr's set whose tag matches.
+/// NOTE(review): `mf` is unused and the line's valid state is not consulted.
+void tag_array::set_hashed_pc_from_tag(new_addr_type addr, mem_fetch *mf, uint8_t hashed_pc){
+  const unsigned first_way = m_config.set_index(addr) * m_config.m_assoc;
+  const new_addr_type wanted_tag = m_config.tag(addr);
+
+  // All ways are visited; there is no early exit after the first match.
+  for (unsigned w = 0; w != m_config.m_assoc; ++w) {
+    cache_block_t *blk = m_lines[first_way + w];
+    if (blk->m_tag == wanted_tag) {
+      blk->m_hashed_pc = hashed_pc;
+    }
+  }
+}
+
+/// Writes `bypassBit` into every tag-matching line of addr's set (`mf` is not used).
+void tag_array::set_bypass_bit_from_tag(new_addr_type addr, mem_fetch *mf, bool bypassBit){
+  const unsigned set_base = m_config.set_index(addr) * m_config.m_assoc;
+  const new_addr_type target_tag = m_config.tag(addr);
+
+  // rajesh cs752: scan the whole set; every matching way is updated.
+  unsigned way = 0;
+  while (way < m_config.m_assoc) {
+    cache_block_t *candidate = m_lines[set_base + way];
+    if (candidate->m_tag == target_tag) candidate->m_bypassBit = bypassBit;
+    ++way;
+  }
+}
+
+/// Returns the bypass bit of the first tag-matching line in addr's set, or false
+/// when no line matches (assumed default, TODO confirm). `mf` is not used.
+bool tag_array::get_bypass_bit_from_tag(new_addr_type addr, mem_fetch *mf){
+  unsigned set_index = m_config.set_index(addr);
+  new_addr_type tag = m_config.tag(addr);
+  for (unsigned way = 0; way < m_config.m_assoc; way++) {
+    cache_block_t *line = m_lines[set_index * m_config.m_assoc + way];
+    if (line->m_tag == tag) {
+      return line->m_bypassBit;
+    }
+  }
+  return false;  // no match: previously fell off the end (undefined behaviour)
+}
+
+//cwpeng
+
 enum cache_request_status tag_array::access(new_addr_type addr, unsigned time,
                                             unsigned &idx, mem_fetch *mf) {
   bool wb = false;
@@ -1836,6 +1881,38 @@ enum cache_request_status data_cache::rd_hit_base(
   return HIT;
 }
 
+//cwpeng: L1D read hit. Decrements (saturating at 0) the predictor counter of the PC
+// stored on the line, stores the current PC, then updates the tag array / atomics state.
+enum cache_request_status data_cache::rd_hit_base_l1d(
+    new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time,
+    std::list<cache_event> &events, enum cache_request_status status,
+    uint8_t* l1d_prediction_table //cwpeng
+  ) {
+  new_addr_type block_addr = m_config.block_addr(addr);
+  // The L1 table has 128 entries (7-bit hashed PC); mask so a uint8_t never indexes past it.
+  const uint8_t pc_mask = 0x7F;
+  uint8_t storedhashedPC = m_tag_array->get_hashed_pc_from_tag(addr, mf) & pc_mask; // Rajesh CS752
+  if(l1d_prediction_table[storedhashedPC] > 0 ){ // Saturating counter stays 0 on 0
+    l1d_prediction_table[storedhashedPC]--;
+  }
+  m_tag_array->set_hashed_pc_from_tag(addr, mf, (uint8_t)(mf->get_pc() & pc_mask));  //cwpeng
+
+  m_tag_array->access(block_addr, time, cache_index, mf);
+  // Atomics treated as global read/write requests - Perform read, mark line as
+  // MODIFIED
+  if (mf->isatomic()) {
+    assert(mf->get_access_type() == GLOBAL_ACC_R);
+    cache_block_t *block = m_tag_array->get_block(cache_index);
+    if (!block->is_modified_line()) {
+      m_tag_array->inc_dirty();
+    }
+    block->set_status(MODIFIED,
+                      mf->get_access_sector_mask());  // mark line as
+    block->set_byte_mask(mf);
+  }
+  return HIT;
+}
+
 /****** Read miss functions (Set by config file) ******/
 
 /// Baseline read miss: Send read request to lower level memory,
@@ -1969,6 +2046,52 @@ enum cache_request_status data_cache::process_tag_probe(
   return access_status;
 }
 
+/// cwpeng: predictor-aware overload of process_tag_probe.
+/// Routes the request through the m_[rd/wr]_[hit/miss] member-function pointers,
+/// which the data_cache constructor sets to reflect the cache configuration.
+/// A read HIT uses m_rd_hit_l1d, which additionally receives the table.
+/// @param wr                    true for a write request, false for a read
+/// @param probe_status          outcome of the tag probe done by the caller
+/// @param l1d_prediction_table  forwarded untouched to the read-hit handler
+/// @return the handler's status, or probe_status when no handler is invoked
+enum cache_request_status data_cache::process_tag_probe(
+    bool wr, enum cache_request_status probe_status, new_addr_type addr,
+    unsigned cache_index, mem_fetch *mf, unsigned time,
+    std::list<cache_event> &events,
+    uint8_t* l1d_prediction_table //cwpeng
+  ) {
+  const bool hit = (probe_status == HIT);
+  const bool reservation_failed = (probe_status == RESERVATION_FAIL);
+  // A write whose reservation failed is still serviced under NO_WRITE_ALLOCATE.
+  const bool write_miss_allowed =
+      !reservation_failed || m_config.m_write_alloc_policy == NO_WRITE_ALLOCATE;
+
+  // At most one handler below runs; otherwise probe_status is returned as-is.
+  cache_request_status outcome = probe_status;
+  if (wr && hit) {
+    outcome =
+        (this->*m_wr_hit)(addr, cache_index, mf, time, events, probe_status);
+  } else if (wr && write_miss_allowed) {
+    outcome =
+        (this->*m_wr_miss)(addr, cache_index, mf, time, events, probe_status);
+  } else if (!wr && hit) {
+    outcome = (this->*m_rd_hit_l1d)(addr, cache_index, mf, time, events,
+                                    probe_status, l1d_prediction_table);
+  } else if (!wr && !reservation_failed) {
+    outcome =
+        (this->*m_rd_miss)(addr, cache_index, mf, time, events, probe_status);
+  } else {
+    // Per the original note, the only reservation failure reaching this point is
+    // LINE_ALLOC_FAIL (all lines reserved): no handler runs, only the stat is bumped.
+    m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL,
+                           mf->get_streamID());
+  }
+
+  m_bandwidth_management.use_data_port(mf, outcome, events);
+  return outcome;
+}
+
+
 // Both the L1 and L2 currently use the same access function.
 // Differentiation between the two caches is done through configuration
 // of caching policies.
@@ -1994,6 +2117,28 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf,
   return access_status;
 }
 
+// cwpeng: predictor-aware access. Probes the tag array for addr's block, lets the
+// process_tag_probe overload that takes l1d_prediction_table act on it, then records stats.
+enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf,
+                                             unsigned time,
+                                             std::list<cache_event> &events,
+                                             uint8_t* l1d_prediction_table // cwpeng
+                                             ) {
+  assert(mf->get_data_size() <= m_config.get_atom_sz());
+  const bool is_write = mf->get_is_write();
+  const new_addr_type line_addr = m_config.block_addr(addr);
+  unsigned way_idx = (unsigned)-1;  // handed to probe() by reference
+  const enum cache_request_status probed =
+      m_tag_array->probe(line_addr, way_idx, mf, mf->is_write(), true);
+  const enum cache_request_status outcome = process_tag_probe(
+      is_write, probed, addr, way_idx, mf, time, events, l1d_prediction_table);
+  m_stats.inc_stats(mf->get_access_type(), m_stats.select_stats_status(probed, outcome),
+                    mf->get_streamID());
+  m_stats.inc_stats_pw(mf->get_access_type(), m_stats.select_stats_status(probed, outcome),
+                       mf->get_streamID());
+  return outcome;
+}
+
 /// This is meant to model the first level data cache in Fermi.
 /// It is write-evict (global) or write-back (local) at the
 /// granularity of individual blocks (Set by GPGPU-Sim configuration file)
@@ -2004,6 +2149,13 @@ enum cache_request_status l1_cache::access(new_addr_type addr, mem_fetch *mf,
   return data_cache::access(addr, mf, time, events);
 }
 
+// cwpeng: L1 entry point for predictor-aware accesses; forwards to data_cache::access.
+enum cache_request_status l1_cache::access(new_addr_type addr, mem_fetch *mf,
+                                           unsigned time, std::list<cache_event> &events,
+                                           uint8_t* l1_prediction_table) {
+  return data_cache::access(addr, mf, time, events, l1_prediction_table);
+}
+
 // The l2 cache access function calls the base data_cache access
 // implementation.  When the L2 needs to diverge from L1, L2 specific
 // changes should be made here.
diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h
@@ -126,7 +126,7 @@ struct cache_block_t {
   cache_block_t() {
     m_tag = 0;
     m_block_addr = 0;
-    hashPC = 0 ; // cwpeng initialize hashed PC
+    m_hashed_pc = 0; m_bypassBit = false; // cwpeng: no PC recorded yet; bypass bit starts cleared so reads are defined
   }
 
   virtual void allocate(new_addr_type tag, new_addr_type block_addr,
@@ -169,7 +169,9 @@ struct cache_block_t {
   new_addr_type m_tag;
   new_addr_type m_block_addr;
 
-  uint8_t hashPC ; // cwpeng hashed PC in memory block (7 bits)
+  // uint8_t hashPC ; // cwpeng hashed PC in memory block (7 bits)
+  uint8_t m_hashed_pc; // cwpeng hashed PC in memory block (7 bits)
+  bool m_bypassBit; // rajesh cs752 L2 Bypass Bit
 };
 
 struct line_cache_block : public cache_block_t {
@@ -182,6 +184,9 @@ struct line_cache_block : public cache_block_t {
     m_set_modified_on_fill = false;
     m_set_readable_on_fill = false;
     m_readable = true;
+
+    m_hashed_pc = 0 ;  // cwpeng initialize hashed PC
+                  // record the last PC that access this block
   }
   void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time,
                 mem_access_sector_mask_t sector_mask) {
@@ -988,6 +993,11 @@ class tag_array {
   void remove_pending_line(mem_fetch *mf);
   void inc_dirty() { m_dirty++; }
 
+  uint8_t get_hashed_pc_from_tag(new_addr_type addr,  mem_fetch *mf);
+  void set_hashed_pc_from_tag(new_addr_type addr,  mem_fetch *mf, uint8_t hashed_pc);
+  void set_bypass_bit_from_tag(new_addr_type addr, mem_fetch *mf, bool bypassBit);
+  bool get_bypass_bit_from_tag(new_addr_type addr, mem_fetch *mf);
+
  protected:
   // This constructor is intended for use only from derived classes that wish to
   // avoid unnecessary memory allocation that takes place in the
@@ -1531,6 +1541,7 @@ class data_cache : public baseline_cache {
 
     // Set read hit function
     m_rd_hit = &data_cache::rd_hit_base;
+    m_rd_hit_l1d = &data_cache::rd_hit_base_l1d;
 
     // Set read miss function
     m_rd_miss = &data_cache::rd_miss_base;
@@ -1582,6 +1593,12 @@ class data_cache : public baseline_cache {
                                            unsigned time,
                                            std::list<cache_event> &events);
 
+  virtual enum cache_request_status access(new_addr_type addr, mem_fetch *mf,
+                                           unsigned time,
+                                           std::list<cache_event> &events,
+                                           uint8_t* l1_prediction_table // cwpeng
+                                           );
+
  protected:
   data_cache(const char *name, cache_config &config, int core_id, int type_id,
              mem_fetch_interface *memport, mem_fetch_allocator *mfcreator,
@@ -1612,6 +1629,15 @@ class data_cache : public baseline_cache {
                                               mem_fetch *mf, unsigned time,
                                               std::list<cache_event> &events);
 
+  enum cache_request_status process_tag_probe(bool wr,
+                                              enum cache_request_status status,
+                                              new_addr_type addr,
+                                              unsigned cache_index,
+                                              mem_fetch *mf, unsigned time,
+                                              std::list<cache_event> &events,
+                                              uint8_t* l1_prediction_table // cwpeng
+                                              );
+
  protected:
   mem_fetch_allocator *m_memfetch_creator;
 
@@ -1681,12 +1707,23 @@ class data_cache : public baseline_cache {
   enum cache_request_status (data_cache::*m_rd_hit)(
       new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time,
       std::list<cache_event> &events, enum cache_request_status status);
+  enum cache_request_status (data_cache::*m_rd_hit_l1d)( //cwpeng
+      new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time,
+      std::list<cache_event> &events, enum cache_request_status status, uint8_t *l1d_prediction_table);
   enum cache_request_status rd_hit_base(new_addr_type addr,
                                         unsigned cache_index, mem_fetch *mf,
                                         unsigned time,
                                         std::list<cache_event> &events,
                                         enum cache_request_status status);
 
+  enum cache_request_status rd_hit_base_l1d(new_addr_type addr,
+                                        unsigned cache_index, mem_fetch *mf,
+                                        unsigned time,
+                                        std::list<cache_event> &events,
+                                        enum cache_request_status status,
+                                        uint8_t* l1_prediction_table // cwpeng
+                                        );
+
   /******* Read-miss configs *******/
   enum cache_request_status (data_cache::*m_rd_miss)(
       new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time,
@@ -1719,7 +1756,14 @@ class l1_cache : public data_cache {
 
   virtual enum cache_request_status access(new_addr_type addr, mem_fetch *mf,
                                            unsigned time,
-                                           std::list<cache_event> &events);
+                                           std::list<cache_event> &events
+                                           );
+
+  virtual enum cache_request_status access(new_addr_type addr, mem_fetch *mf,
+                                           unsigned time,
+                                           std::list<cache_event> &events,
+                                           uint8_t* l1_prediction_table // cwpeng
+                                           );
 
   uint8_t prediction_table[128] ; // cwpeng prediction table in L1 cache (4 bits each entry)
 
diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc
@@ -2127,7 +2127,7 @@ void ldst_unit::L1_latency_queue_cycle() {
           m_L1D->access(mf_next->get_addr(), mf_next,
                         m_core->get_gpu()->gpu_sim_cycle +
                             m_core->get_gpu()->gpu_tot_sim_cycle,
-                        events);
+                        events, m_L1D->prediction_table);
 
       bool write_sent = was_write_sent(events);
       bool read_sent = was_read_sent(events);