Skip to content

Commit 37fee97

Browse files
committed
remove caching in dynamicemb option
1 parent dc86908 commit 37fee97

File tree

14 files changed

+48
-129
lines changed

14 files changed

+48
-129
lines changed

corelib/dynamicemb/benchmark/benchmark_batched_dynamicemb_tables.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,6 @@ def create_dynamic_embedding_tables(args, device):
322322
score_strategy=DynamicEmbScoreStrategy.LFU
323323
if args.cache_algorithm == "lfu"
324324
else DynamicEmbScoreStrategy.TIMESTAMP,
325-
caching=args.caching,
326325
)
327326
)
328327

corelib/dynamicemb/dynamicemb/batched_dynamicemb_tables.py

Lines changed: 28 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -524,48 +524,11 @@ def __init__(
524524
dtype=self.embedding_dtype,
525525
)
526526
)
527-
528-
def _enable_cache_decision(
529-
self, caching, hbm_budget, value_size, capacity, bucket_capacity
530-
) -> Tuple[bool, int, bool]:
531-
"""
532-
The principles are as follows:
533-
1. Cannot exceed the budget
534-
2. If the budget is too small, HBM is not necessary
535-
"""
536-
537-
# if not caching, it is the flag to represent the single table on device or host.
538-
on_device = True
539-
540-
if not caching:
541-
if hbm_budget == 0:
542-
on_device = False
543-
elif hbm_budget > 0 and hbm_budget < value_size * capacity:
544-
# Caching is not enabled but HBM is not enough to hold the entire table, so adjust to caching mode
545-
# to utilize the reserved HBM
546-
caching = True
547-
548-
if caching:
549-
# If the budget is too small, HBM is not necessary
550-
if hbm_budget < bucket_capacity * value_size:
551-
warnings.warn(
552-
"The HBM budget is too small to serve as a cache, fallback to host table.",
553-
UserWarning,
554-
)
555-
caching = False
556-
on_device = False
557-
558-
if caching:
559-
cache_capacity = hbm_budget // value_size
560-
else:
561-
cache_capacity = -1
562-
563-
return caching, cache_capacity, on_device
564527

565528
def _create_cache_storage(self) -> None:
566529
self._storages: List[Storage] = []
567530
self._caches: List[Cache] = []
568-
self._caching = self._dynamicemb_options[0].caching
531+
self._caching = False
569532

570533
for option in self._dynamicemb_options:
571534
if option.training and option.optimizer_type == OptimizerType.Null:
@@ -576,28 +539,39 @@ def _create_cache_storage(self) -> None:
576539
"Set OptimizerType to Null as not on training mode.", UserWarning
577540
)
578541

579-
bucket_capacity_for_cache = 1024
580-
581-
caching, cache_capacity, on_device = self._enable_cache_decision(
582-
option.caching,
583-
option.local_hbm_for_values,
584-
get_value_size(
585-
option.embedding_dtype,
586-
option.dim,
587-
option.optimizer_type,
588-
),
589-
option.max_capacity,
590-
bucket_capacity_for_cache,
542+
value_size = get_value_size(
543+
option.embedding_dtype,
544+
option.dim,
545+
option.optimizer_type,
591546
)
547+
total_table_bytes = value_size * option.max_capacity
548+
hbm_budget = option.local_hbm_for_values
549+
550+
if hbm_budget == 0:
551+
# No HBM budget -> storage only, on host
552+
option.local_hbm_for_values = 0
553+
self._caches.append(None)
554+
self._storages.append(DynamicEmbeddingTable(option, self._optimizer))
555+
elif total_table_bytes <= hbm_budget:
556+
# Entire table fits in HBM -> single table on GPU serves as
557+
# both cache and storage (no eviction needed).
558+
self._caching = True
559+
table = DynamicEmbeddingTable(option, self._optimizer)
560+
self._caches.append(table)
561+
self._storages.append(table)
562+
else:
563+
# Partial HBM -> cache (GPU) + storage (host or external)
564+
self._caching = True
565+
bucket_capacity_for_cache = 1024
566+
cache_capacity = hbm_budget // value_size
592567

593-
if caching:
594568
cache_option = deepcopy(option)
595569
cache_option.bucket_capacity = bucket_capacity_for_cache
596-
597570
cache_option.max_capacity = cache_capacity
598571
cache_option.init_capacity = cache_capacity
599-
600-
self._caches.append(DynamicEmbeddingTable(cache_option, self._optimizer))
572+
self._caches.append(
573+
DynamicEmbeddingTable(cache_option, self._optimizer)
574+
)
601575

602576
storage_option = deepcopy(option)
603577
storage_option.local_hbm_for_values = 0
@@ -607,11 +581,6 @@ def _create_cache_storage(self) -> None:
607581
if PS
608582
else DynamicEmbeddingTable(storage_option, self._optimizer)
609583
)
610-
else:
611-
self._caches.append(None)
612-
if not on_device:
613-
option.local_hbm_for_values = 0
614-
self._storages.append(DynamicEmbeddingTable(option, self._optimizer))
615584

616585
_print_memory_consume(
617586
self._table_names, self._dynamicemb_options, self._optimizer, self.device_id

corelib/dynamicemb/dynamicemb/dynamicemb_config.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -222,22 +222,6 @@ class DynamicEmbTableOptions(_ContextOptions):
222222
For `UNIFORM` and `TRUNCATED_NORMAL`, the `lower` and `upper` will set to $\pm {1 \over \sqrt{EmbeddingConfig.num\_embeddings}}$.
223223
eval_initializer_args: DynamicEmbInitializerArgs
224224
The initializer args for evaluation mode, and will return torch.zeros(...) as embedding by default if index/sparse feature is missing.
225-
caching: bool
226-
Flag to indicate dynamic embedding tables is working on caching mode, default to `False`.
227-
When the device memory on a single GPU is insufficient to accommodate a single shard of the dynamic embedding table,
228-
dynamicemb supports the mixed use of device memory and host memory(pinned memory).
229-
But by default, the values of the entire table are concatenated with device memory and host memory.
230-
This means that the storage location of one embedding is determined by `hash_function(key)`, and mapping to device memory will bring better lookup performance.
231-
However, sparse features in training are often with temporal locality.
232-
In order to store hot keys in device memory, dynamicemb creates two table instances,
233-
whose values are stored in device memory and host memory respectively, and store hot keys on the GPU table priorily.
234-
If the GPU table is full, the evicted keys will be inserted into the host table.
235-
If the host table is also full, the key will be evicted(all the eviction is based on the score per key).
236-
The original intention of eviction is based on this insight: features that only appear once should not occupy memory(even host memory) for a long time.
237-
In short:
238-
set **`caching=True`** will create a GPU table and a host table, and make GPU table serves as a cache;
239-
set **`caching=False`** will create a hybrid table which use GPU and host memory in a concatenated way to store value.
240-
All keys and other meta data are always stored on GPU for both cases.
241225
init_capacity : Optional[int], optional
242226
The initial capacity of the table. If not set, it defaults to max_capacity after sharding.
243227
If `init_capacity` is provided, it will serve as the initial table capacity on a single GPU.
@@ -265,9 +249,9 @@ class DynamicEmbTableOptions(_ContextOptions):
265249
Please refer to the API documentation for DynamicEmbCheckMode for more information.
266250
global_hbm_for_values : int
267251
Total GPU memory allocated to store embedding + optimizer states, in bytes. Default is 0.
268-
It has different meanings under `caching=True` and `caching=False`.
269-
When `caching=False`, it decides how much GPU memory is in the total memory to store value in a single hybrid table.
270-
When `caching=True`, it decides the table capacity of the GPU table.
252+
If the budget can hold the entire table (max_capacity * value_size), the table lives entirely on GPU.
253+
If the budget is nonzero but smaller, it determines the GPU cache capacity while the full table is stored on host/external storage.
254+
If zero, the table is stored entirely on host memory.
271255
external_storage: Storage
272256
The external storage/ParameterServer which inherits the interface of Storage, and can be configured per table.
273257
If not provided, DynamicEmbeddingTable will be used as the Storage.
@@ -297,7 +281,6 @@ class DynamicEmbTableOptions(_ContextOptions):
297281
value=0.0,
298282
)
299283
)
300-
caching: bool = False
301284
init_capacity: Optional[
302285
int
303286
] = None # if not set then set to max_capcacity after sharded
@@ -339,7 +322,6 @@ def __ne__(self, other):
339322
def get_grouped_key(self):
340323
grouped_key = {}
341324
grouped_key["training"] = self.training
342-
grouped_key["caching"] = self.caching
343325
grouped_key["external_storage"] = self.external_storage
344326
grouped_key["index_type"] = self.index_type
345327
grouped_key["score_strategy"] = self.score_strategy

corelib/dynamicemb/example/example.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,6 @@ def get_planner(
521521
mode=DynamicEmbInitializerMode.NORMAL
522522
),
523523
score_strategy=DynamicEmbScoreStrategy.STEP,
524-
caching=caching,
525524
training=training,
526525
admit_strategy=admit_strategy,
527526
admission_counter=admission_counter,
@@ -596,7 +595,7 @@ def apply_dmp(model, args, training):
596595
args.batch_size,
597596
optimizer_type=optimizer_type,
598597
training=training,
599-
caching=args.caching,
598+
caching=args.caching, # used for HBM budget calculation, not passed to options
600599
args=args,
601600
)
602601
# get plan for all ranks.

corelib/dynamicemb/test/test_batched_dynamic_embedding_tables_v2.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -482,7 +482,6 @@ def test_forward_train_eval(
482482
embedding_dtype=value_type,
483483
device_id=device_id,
484484
score_strategy=DynamicEmbScoreStrategy.TIMESTAMP,
485-
caching=caching,
486485
local_hbm_for_values=1024**3,
487486
external_storage=PS,
488487
)
@@ -648,7 +647,6 @@ def test_backward(opt_type, opt_params, caching, pooling_mode, dims, determinist
648647
embedding_dtype=value_type,
649648
device_id=device_id,
650649
score_strategy=DynamicEmbScoreStrategy.TIMESTAMP,
651-
caching=caching,
652650
local_hbm_for_values=1024**3,
653651
external_storage=PS,
654652
)
@@ -791,7 +789,6 @@ def test_prefetch_flush_in_cache(opt_type, opt_params, deterministic, PS):
791789
embedding_dtype=value_type,
792790
device_id=device_id,
793791
score_strategy=DynamicEmbScoreStrategy.STEP,
794-
caching=True,
795792
local_hbm_for_values=1024**3,
796793
external_storage=PS,
797794
)
@@ -976,7 +973,6 @@ def test_deterministic_insert(opt_type, opt_params, caching, PS, iteration, batc
976973
embedding_dtype=value_type,
977974
device_id=device_id,
978975
score_strategy=DynamicEmbScoreStrategy.TIMESTAMP,
979-
caching=caching,
980976
local_hbm_for_values=init_capacity * dim * 4,
981977
external_storage=PS,
982978
)
@@ -1093,7 +1089,6 @@ def test_empty_batch(opt_type, opt_params, dim, caching, deterministic, PS):
10931089
embedding_dtype=value_type,
10941090
device_id=device_id,
10951091
score_strategy=DynamicEmbScoreStrategy.TIMESTAMP,
1096-
caching=caching,
10971092
local_hbm_for_values=1024**3,
10981093
external_storage=PS,
10991094
)

corelib/dynamicemb/test/unit_tests/incremental_dump/test_batched_dynamicemb_tables.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,6 @@ def test_without_eviction(
133133
local_hbm_for_values=1024**3,
134134
score_strategy=score_strategy,
135135
num_of_buckets_per_alloc=num_embeddings[i] // bucket_capacity,
136-
caching=caching,
137136
)
138137
for i in range(table_num)
139138
]

corelib/dynamicemb/test/unit_tests/test_embedding_admission.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,7 @@ def test_admission_strategy_validation(
311311
score_strategy
312312
), # Use timestamp for admission
313313
use_index_dedup=use_index_dedup,
314-
caching=caching,
315-
cache_capacity_ratio=cache_capacity_ratio if caching else 0.1,
314+
cache_capacity_ratio=cache_capacity_ratio if caching else 1.0,
316315
admit_strategy=admission_strategy, # Pass admission strategy
317316
)
318317

corelib/dynamicemb/test/unit_tests/test_embedding_dump_load.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,6 @@ def apply_dmp(
199199
device: torch.device,
200200
score_strategy: DynamicEmbScoreStrategy = DynamicEmbScoreStrategy.LFU,
201201
use_index_dedup: bool = False,
202-
caching: bool = False,
203202
cache_capacity_ratio: float = 0.5,
204203
admit_strategy: AdmissionStrategy = None,
205204
):
@@ -213,11 +212,7 @@ def apply_dmp(
213212
tmp_type = eb_config.data_type
214213

215214
embedding_type_bytes = DATA_TYPE_NUM_BITS[tmp_type] / 8
216-
emb_num_embeddings = (
217-
eb_config.num_embeddings * cache_capacity_ratio
218-
if caching
219-
else eb_config.num_embeddings
220-
)
215+
emb_num_embeddings = eb_config.num_embeddings
221216
emb_num_embeddings_next_power_of_2 = 2 ** math.ceil(
222217
math.log2(emb_num_embeddings)
223218
) # HKV need embedding vector num is power of 2
@@ -249,12 +244,20 @@ def apply_dmp(
249244
else 0
250245
)
251246

252-
# Include optimizer state in HBM calculation
253-
total_hbm_need = (
247+
# Include optimizer state in HBM calculation.
248+
# When cache_capacity_ratio < 1, scale down so that only a
249+
# fraction of the table fits in HBM (triggers cache+storage).
250+
# When cache_capacity_ratio >= 1, use full size (all-HBM mode).
251+
full_table_hbm = (
254252
embedding_type_bytes
255253
* (dim + optimizer_state_dim)
256254
* emb_num_embeddings_next_power_of_2
257255
)
256+
total_hbm_need = int(
257+
full_table_hbm * cache_capacity_ratio
258+
if cache_capacity_ratio < 1.0
259+
else full_table_hbm
260+
)
258261

259262
admission_counter = KVCounter(
260263
max(1024 * 1024, emb_num_embeddings_next_power_of_2 // 4)
@@ -268,7 +271,6 @@ def apply_dmp(
268271
),
269272
bucket_capacity=emb_num_embeddings_next_power_of_2,
270273
max_capacity=emb_num_embeddings_next_power_of_2,
271-
caching=caching,
272274
local_hbm_for_values=1024**3,
273275
admit_strategy=admit_strategy,
274276
admission_counter=admission_counter,
@@ -308,7 +310,6 @@ def create_model(
308310
optimizer_kwargs: Dict[str, Any],
309311
score_strategy: DynamicEmbScoreStrategy = DynamicEmbScoreStrategy.LFU,
310312
use_index_dedup: bool = False,
311-
caching: bool = False,
312313
cache_capacity_ratio: float = 0.5,
313314
admit_strategy: AdmissionStrategy = None,
314315
):
@@ -344,7 +345,6 @@ def create_model(
344345
torch.device(f"cuda:{torch.cuda.current_device()}"),
345346
score_strategy=score_strategy,
346347
use_index_dedup=use_index_dedup,
347-
caching=caching,
348348
cache_capacity_ratio=cache_capacity_ratio,
349349
admit_strategy=admit_strategy,
350350
)

corelib/dynamicemb/test/unit_tests/test_lfu_scores.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,8 +270,7 @@ def test_lfu_score_validation(
270270
optimizer_kwargs=optimizer_kwargs,
271271
score_strategy=DynamicEmbScoreStrategy.LFU,
272272
use_index_dedup=use_index_dedup,
273-
caching=caching,
274-
cache_capacity_ratio=cache_capacity_ratio if caching else 0.1,
273+
cache_capacity_ratio=cache_capacity_ratio if caching else 1.0,
275274
)
276275

277276
# Generate features with frequency tracking

examples/hstu/test_utils.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -577,8 +577,6 @@ def create_model(
577577
"item": DynamicEmbTableOptions(
578578
global_hbm_for_values=1024 * 1024, # 1M HBM (maybe cached)
579579
score_strategy=DynamicEmbScoreStrategy.STEP,
580-
caching=pipeline_type
581-
== "prefetch", # when prefetch is enabled, we must enable caching
582580
),
583581
}
584582
if use_dynamic_emb

0 commit comments

Comments
 (0)