Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,6 @@ def create_dynamic_embedding_tables(args, device):
score_strategy=DynamicEmbScoreStrategy.LFU
if args.cache_algorithm == "lfu"
else DynamicEmbScoreStrategy.TIMESTAMP,
caching=args.caching,
)
)

Expand Down
50 changes: 30 additions & 20 deletions corelib/dynamicemb/dynamicemb/batched_dynamicemb_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ def __init__(
def _create_cache_storage(self) -> None:
self._storages: List[Storage] = []
self._caches: List[Cache] = []
self._caching = self._dynamicemb_options[0].caching
self._caching = False

for option in self._dynamicemb_options:
if option.training and option.optimizer_type == OptimizerType.Null:
Expand All @@ -539,23 +539,36 @@ def _create_cache_storage(self) -> None:
"Set OptimizerType to Null as not on training mode.", UserWarning
)

if option.caching and option.training:
cache_option = deepcopy(option)
cache_option.bucket_capacity = 1024
capacity = get_constraint_capacity(
option.local_hbm_for_values,
option.embedding_dtype,
option.dim,
option.optimizer_type,
cache_option.bucket_capacity,
)
if capacity == 0:
raise ValueError(
"Can't use caching mode as the reserved HBM size is too small."
)
value_size = get_value_size(
option.embedding_dtype,
option.dim,
option.optimizer_type,
)
total_table_bytes = value_size * option.max_capacity
hbm_budget = option.local_hbm_for_values

if hbm_budget == 0:
# No HBM budget -> storage only, on host
option.local_hbm_for_values = 0
self._caches.append(None)
self._storages.append(DynamicEmbeddingTable(option, self._optimizer))
elif total_table_bytes <= hbm_budget:
# Entire table fits in HBM -> single table on GPU serves as
# both cache and storage (no eviction needed).
self._caching = True
table = DynamicEmbeddingTable(option, self._optimizer)
self._caches.append(table)
self._storages.append(table)
else:
# Partial HBM -> cache (GPU) + storage (host or external)
self._caching = True
bucket_capacity_for_cache = 1024
cache_capacity = hbm_budget // value_size

cache_option.max_capacity = capacity
cache_option.init_capacity = capacity
cache_option = deepcopy(option)
cache_option.bucket_capacity = bucket_capacity_for_cache
cache_option.max_capacity = cache_capacity
cache_option.init_capacity = cache_capacity
self._caches.append(
DynamicEmbeddingTable(cache_option, self._optimizer)
)
Expand All @@ -568,9 +581,6 @@ def _create_cache_storage(self) -> None:
if PS
else DynamicEmbeddingTable(storage_option, self._optimizer)
)
else:
self._caches.append(None)
self._storages.append(DynamicEmbeddingTable(option, self._optimizer))

_print_memory_consume(
self._table_names, self._dynamicemb_options, self._optimizer, self.device_id
Expand Down
48 changes: 27 additions & 21 deletions corelib/dynamicemb/dynamicemb/dynamicemb_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,22 +222,6 @@ class DynamicEmbTableOptions(_ContextOptions):
For `UNIFORM` and `TRUNCATED_NORMAL`, the `lower` and `upper` will set to $\pm {1 \over \sqrt{EmbeddingConfig.num\_embeddings}}$.
eval_initializer_args: DynamicEmbInitializerArgs
The initializer args for evaluation mode, and will return torch.zeros(...) as embedding by default if index/sparse feature is missing.
caching: bool
Flag to indicate dynamic embedding tables is working on caching mode, default to `False`.
When the device memory on a single GPU is insufficient to accommodate a single shard of the dynamic embedding table,
dynamicemb supports the mixed use of device memory and host memory(pinned memory).
But by default, the values of the entire table are concatenated with device memory and host memory.
This means that the storage location of one embedding is determined by `hash_function(key)`, and mapping to device memory will bring better lookup performance.
However, sparse features in training are often with temporal locality.
In order to store hot keys in device memory, dynamicemb creates two table instances,
whose values are stored in device memory and host memory respectively, and store hot keys on the GPU table priorily.
If the GPU table is full, the evicted keys will be inserted into the host table.
If the host table is also full, the key will be evicted(all the eviction is based on the score per key).
The original intention of eviction is based on this insight: features that only appear once should not occupy memory(even host memory) for a long time.
In short:
set **`caching=True`** will create a GPU table and a host table, and make GPU table serves as a cache;
set **`caching=False`** will create a hybrid table which use GPU and host memory in a concatenated way to store value.
All keys and other meta data are always stored on GPU for both cases.
init_capacity : Optional[int], optional
The initial capacity of the table. If not set, it defaults to max_capacity after sharding.
If `init_capacity` is provided, it will serve as the initial table capacity on a single GPU.
Expand Down Expand Up @@ -265,9 +249,9 @@ class DynamicEmbTableOptions(_ContextOptions):
Please refer to the API documentation for DynamicEmbCheckMode for more information.
global_hbm_for_values : int
Total GPU memory allocated to store embedding + optimizer states, in bytes. Default is 0.
It has different meanings under `caching=True` and `caching=False`.
When `caching=False`, it decides how much GPU memory is in the total memory to store value in a single hybrid table.
When `caching=True`, it decides the table capacity of the GPU table.
If the budget can hold the entire table (max_capacity * value_size), the table lives entirely on GPU.
If the budget is nonzero but smaller, it determines the GPU cache capacity while the full table is stored on host/external storage.
If zero, the table is stored entirely on host memory.
external_storage: Storage
The external storage/ParameterServer which inherits the interface of Storage, and can be configured per table.
If not provided, will using DynamicEmbeddingTable as the Storage.
Expand Down Expand Up @@ -297,7 +281,6 @@ class DynamicEmbTableOptions(_ContextOptions):
value=0.0,
)
)
caching: bool = False
init_capacity: Optional[
int
] = None # if not set then set to max_capacity after sharding
Expand Down Expand Up @@ -339,7 +322,6 @@ def __ne__(self, other):
def get_grouped_key(self):
grouped_key = {}
grouped_key["training"] = self.training
grouped_key["caching"] = self.caching
grouped_key["external_storage"] = self.external_storage
grouped_key["index_type"] = self.index_type
grouped_key["score_strategy"] = self.score_strategy
Expand Down Expand Up @@ -498,6 +480,30 @@ def validate_initializer_args(
initializer_args.upper = default_upper


def get_comsued_bytes_of_table(
    max_capacity,
    dtype,
    dim,
    optimizer_type,
) -> int:
    """Return the total bytes consumed by a table holding ``max_capacity`` vectors.

    Each stored vector consists of the embedding (``dim`` elements) plus the
    optimizer state for that vector, so the total is simply the per-vector
    byte size multiplied by the row count.

    NOTE(review): the name contains a typo ("comsued" -> "consumed"); it is
    kept as-is for backward compatibility with existing callers.

    Args:
        max_capacity: Number of vectors (rows) the table can hold.
        dtype: Element dtype of the embedding values.
        dim: Embedding dimension per vector.
        optimizer_type: Optimizer whose per-vector state is co-located with the value.

    Returns:
        Total memory consumption of the table in bytes.
    """
    # Delegate the per-vector size (embedding + optimizer state) to the shared
    # helper so the two size computations cannot drift apart.
    return max_capacity * get_value_size(dtype, dim, optimizer_type)


def get_value_size(
    dtype,
    dim,
    optimizer_type,
) -> int:
    """Return the bytes needed to store a single value vector.

    A value vector is the embedding itself (``dim`` elements) plus whatever
    per-vector optimizer state the given optimizer requires, all stored in
    ``dtype``.

    Args:
        dtype: Element dtype of the embedding values.
        dim: Embedding dimension per vector.
        optimizer_type: Optimizer whose per-vector state size is included.

    Returns:
        Per-vector memory consumption in bytes.
    """
    # Optimizer state contributes extra elements per vector (optimizer-dependent).
    state_dim = get_optimizer_state_dim(optimizer_type, dim, dtype)
    elements_per_vector = dim + state_dim
    return elements_per_vector * dtype_to_bytes(dtype)


def get_constraint_capacity(
memory_bytes,
dtype,
Expand Down
Loading