Skip to content

Commit 32c791f

Browse files
Kathy Xu authored and facebook-github-bot committed
add read load metrics (#4956)
Summary: Pull Request resolved: #4956. Add DRAM metrics to the TBE stats report: the read load count, which measures the amount loaded by reads. Reviewed By: emlin. Differential Revision: D83506472. fbshipit-source-id: eb36b588d0ecfa1da998f855169f5b092ff473ff
1 parent 56ecc59 commit 32c791f

File tree

2 files changed

+20
-3
lines changed

2 files changed

+20
-3
lines changed

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3971,8 +3971,8 @@ def _report_dram_kv_perf_stats(self) -> None:
39713971
self.step, stats_reporter.report_interval # pyre-ignore
39723972
)
39733973

3974-
if len(dram_kv_perf_stats) != 23:
3975-
logging.error("dram cache perf stats should have 23 elements")
3974+
if len(dram_kv_perf_stats) != 24:
3975+
logging.error("dram cache perf stats should have 24 elements")
39763976
return
39773977

39783978
dram_read_duration = dram_kv_perf_stats[0]
@@ -4001,6 +4001,7 @@ def _report_dram_kv_perf_stats(self) -> None:
40014001
dram_kv_allocated_bytes = dram_kv_perf_stats[20]
40024002
dram_kv_actual_used_chunk_bytes = dram_kv_perf_stats[21]
40034003
dram_kv_num_rows = dram_kv_perf_stats[22]
4004+
dram_kv_read_counts = dram_kv_perf_stats[23]
40044005

40054006
stats_reporter.report_duration(
40064007
iteration_step=self.step,
@@ -4142,6 +4143,13 @@ def _report_dram_kv_perf_stats(self) -> None:
41424143
enable_tb_metrics=True,
41434144
)
41444145

4146+
stats_reporter.report_data_amount(
4147+
iteration_step=self.step,
4148+
event_name="dram_kv.perf.get.dram_kv_read_counts",
4149+
data_bytes=dram_kv_read_counts,
4150+
enable_tb_metrics=True,
4151+
)
4152+
41454153
stats_reporter.report_data_amount(
41464154
iteration_step=self.step,
41474155
event_name=self.dram_kv_allocated_bytes_stats_name,

fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,10 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
777777
const at::Tensor& count,
778778
int64_t width_offset = 0,
779779
std::optional<int64_t> width_length = std::nullopt) {
780+
auto read_count = count.scalar_type() == at::ScalarType::Long
781+
? *(count.data_ptr<int64_t>())
782+
: *(count.data_ptr<int32_t>());
783+
read_num_counts_ += read_count;
780784
// assuming get is called once each iteration and only by train
781785
// iteration(excluding state_dict)
782786
auto start_ts = facebook::WallClockUtil::NowInUsecFast();
@@ -1330,7 +1334,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
13301334
std::vector<double> get_dram_kv_perf(
13311335
const int64_t step,
13321336
const int64_t interval) {
1333-
std::vector<double> ret(23, 0); // num metrics
1337+
std::vector<double> ret(24, 0); // num metrics
13341338
if (step > 0 && step % interval == 0) {
13351339
int reset_val = 0;
13361340

@@ -1377,6 +1381,8 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
13771381
auto dram_bwd_l1_cnflct_miss_write_missing_load_ =
13781382
bwd_l1_cnflct_miss_write_missing_load_avg_.exchange(reset_val);
13791383

1384+
auto read_num_counts = read_num_counts_.exchange(reset_val);
1385+
13801386
ret[0] = dram_read_total_duration / interval;
13811387
ret[1] = dram_read_sharding_total_duration / interval;
13821388
ret[2] = dram_read_cache_hit_copy_duration / interval;
@@ -1404,6 +1410,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
14041410
ret[21] = get_map_actual_used_chunk_in_bytes();
14051411

14061412
ret[22] = get_num_rows();
1413+
ret[23] = read_num_counts / interval;
14071414
}
14081415
return ret;
14091416
}
@@ -1659,6 +1666,8 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
16591666
std::atomic<int64_t> inplace_update_hit_cnt_{0};
16601667
std::atomic<int64_t> inplace_update_miss_cnt_{0};
16611668

1669+
std::atomic<int64_t> read_num_counts_{0};
1670+
16621671
bool disable_random_init_;
16631672
}; // class DramKVEmbeddingCache
16641673

0 commit comments

Comments
 (0)