Commit 02ce8a2

shijieliu and jiashuy authored
Fea unify pooling to dynamic embedding table (#301)
* DynamicEmbeddingFunctionV2 support pooling and mixed_D pooling; clean hkv & related code; update doc; minor fix
* clean
* fix
* minor fix
* minor fix
* Update benchmark results
* remove device_sm_count in python
* fix pooling mode enum
* fix
* add check
* fix ci
* fix ci
* clean v2 suffix

---------

Co-authored-by: Jiashu Yao <jiashu.yao.cn@gmail.com>
1 parent 0010954 commit 02ce8a2

62 files changed: +1063 / -7257 lines

Note: large commits have some content hidden by default, so only a subset of the 62 changed files appears below.

.gitmodules

Lines changed: 0 additions & 3 deletions
@@ -1,6 +1,3 @@
-[submodule "third_party/HierarchicalKV"]
-    path = third_party/HierarchicalKV
-    url = https://github.com/NVIDIA-Merlin/HierarchicalKV.git
 [submodule "third_party/cutlass"]
     path = third_party/cutlass
     url = https://github.com/NVIDIA/cutlass.git

corelib/dynamicemb/DynamicEmb_APIs.md

Lines changed: 54 additions & 21 deletions
@@ -11,6 +11,7 @@ This document consists of two parts, one is the introduction to the API, which c
 - [DynamicEmbCheckMode](#dynamicembcheckmode)
 - [DynamicEmbInitializerMode](#dynamicembinitializermode)
 - [DynamicEmbInitializerArgs](#dynamicembinitializerargs)
+- [DynamicEmbPoolingMode](#dynamicembpoolingmode)
 - [DynamicEmbTableOptions](#dynamicembtableoptions)
 - [DynamicEmbDump](#dynamicembdump)
 - [DynamicEmbLoad](#dynamicembload)
@@ -39,7 +40,7 @@ The `DynamicEmbParameterConstraints` function inherits from TorchREC's `Paramete
     use_dynamicemb : Optional[bool]
         A flag indicating whether to use DynamicEmb storage. Defaults to False.
     dynamicemb_options : Optional[DynamicEmbTableOptions]
-        Including HKV Configs and Initializer Args. The initialization method for the parameters.
+        Configuration for the dynamic embedding table, including initializer args.
         Common choices include "uniform", "normal", etc. Defaults to "uniform".
     """
     use_dynamicemb: Optional[bool] = False
@@ -273,8 +274,8 @@ Parameters for each random initialization method in DynamicEmbInitializerMode.
 ## DynamicEmbScoreStrategy
 
 The storage space is limited, but the value range of sparse features is relatively large,
-so HKV introduces the concept of score to perform customized evcition of sparse features within the limited storage space.
-Based on the score of HKV, dynamicemb provides the following strategies to set the score.
+so dynamicemb introduces the concept of score to perform customized eviction of sparse features within the limited storage space.
+dynamicemb provides the following strategies to set the score.
 
 ```python
 #How to import
@@ -309,7 +310,40 @@ Based on the score of HKV, dynamicemb provides the following strategies to set t
     CUSTOMIZED = 2
 ```
 
-Users can specify the `DynamicEmbScoreStrategy` using `score_strategy` in `DynamicEmbTableOptions` per table.
+Users can specify the `DynamicEmbScoreStrategy` using `score_strategy` in `DynamicEmbTableOptions` per table.
+
+## DynamicEmbPoolingMode
+
+DynamicEmb supports three pooling modes that determine how embedding lookups are aggregated. These modes correspond to how `EmbeddingCollection` (sequence) and `EmbeddingBagCollection` (pooled) work in TorchREC.
+
+All pooling modes use fused CUDA kernels for both forward and backward passes. Tables with different embedding dimensions (mixed-D) are fully supported in `SUM` and `MEAN` modes.
+
+```python
+#How to import
+from dynamicemb import DynamicEmbPoolingMode
+
+#API arguments
+class DynamicEmbPoolingMode(enum.IntEnum):
+    """
+    Enumeration for pooling modes in dynamic embedding lookup.
+
+    Attributes
+    ----------
+    SUM : int
+        Sum pooling. For each sample, the embeddings of all indices in the bag
+        are summed. Output shape: (batch_size, total_D) where total_D is the
+        sum of embedding dimensions across all features.
+    MEAN : int
+        Mean pooling. For each sample, the embeddings of all indices in the bag
+        are averaged. Output shape: same as SUM.
+    NONE : int
+        No pooling (sequence mode). Each index produces its own embedding row.
+        Output shape: (total_indices, D).
+    """
+    SUM = 0
+    MEAN = 1
+    NONE = 2
+```
 
 ## DynamicEmbTableOptions
 
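Editor's note: the shapes described in the enum above can be reproduced with plain PyTorch ops. The snippet below is a standalone illustration of the `SUM`/`MEAN`/`NONE` semantics only; it does not call any dynamicemb API, and the single-feature layout (one `lengths` tensor per batch) is an assumption made for brevity. With multiple features, `total_D` is the sum of the per-feature embedding dimensions.

```python
# Standalone illustration of the three pooling semantics, plain PyTorch only.
import torch

D = 4                                # embedding dim of a single feature
batch_size = 3
lengths = torch.tensor([2, 1, 3])    # ragged bag sizes per sample
total_indices = int(lengths.sum())   # 6
emb = torch.randn(total_indices, D)  # one looked-up embedding row per index

# NONE (sequence mode): every index keeps its own row -> (total_indices, D)
out_none = emb

# SUM: reduce the rows of each bag -> (batch_size, D); with several features the
# per-feature outputs are concatenated, giving (batch_size, total_D)
segment_ids = torch.repeat_interleave(torch.arange(batch_size), lengths)
out_sum = torch.zeros(batch_size, D).index_add_(0, segment_ids, emb)

# MEAN: SUM divided by the bag length (clamped to avoid division by zero)
out_mean = out_sum / lengths.clamp(min=1).unsqueeze(1).to(out_sum.dtype)

print(out_none.shape, out_sum.shape, out_mean.shape)
# torch.Size([6, 4]) torch.Size([3, 4]) torch.Size([3, 4])
```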
@@ -345,18 +379,18 @@ Dynamic embedding table parameter class, used to configure the parameters for ea
     caching: bool
         Flag to indicate dynamic embedding tables is working on caching mode, default to `False`.
         When the device memory on a single GPU is insufficient to accommodate a single shard of the dynamic embedding table,
-        HKV supports the mixed use of device memory and host memory(pinned memory).
+        dynamicemb supports the mixed use of device memory and host memory(pinned memory).
         But by default, the values of the entire table are concatenated with device memory and host memory.
-        This means that the storage location of one embeddng is determined by `hash_function(key)`, and mapping to device memory will bring better lookup performance.
+        This means that the storage location of one embedding is determined by `hash_function(key)`, and mapping to device memory will bring better lookup performance.
         However, sparse features in training are often with temporal locality.
-        In order to store hot keys in device memory, dynamicemb creates two HKV instances,
-        whose values are stored in device memory and memory respectively, and store hot keys on the GPU table priorily.
-        If the GPU table is full, the evicted keys will be inserted into the CPU table.
-        If the CPU table is also full, the key granularity will be evicted(all the eviction is based on the score per key).
+        In order to store hot keys in device memory, dynamicemb creates two table instances,
+        whose values are stored in device memory and host memory respectively, and store hot keys on the GPU table priorily.
+        If the GPU table is full, the evicted keys will be inserted into the host table.
+        If the host table is also full, the key will be evicted(all the eviction is based on the score per key).
         The original intention of eviction is based on this insight: features that only appear once should not occupy memory(even host memory) for a long time.
         In short:
-        set **`caching=True`** will create a GPU table and a CPU table, and make GPU table serves as a cache;
-        set **`caching=False`** will create a hybrid table which use GPU and CPU memory in a concated way to store value.
+        set **`caching=True`** will create a GPU table and a host table, and make GPU table serves as a cache;
+        set **`caching=False`** will create a hybrid table which use GPU and host memory in a concatenated way to store value.
         All keys and other meta data are always stored on GPU for both cases.
     init_capacity : Optional[int], optional
         The initial capacity of the table. If not set, it defaults to max_capacity after sharding.
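Editor's note: to make the two layouts described in the `caching` docstring concrete, here is a minimal, hedged sketch of the corresponding table options. Only fields that appear in this diff (`index_type`, `caching`, `bucket_capacity`) are set; every other constructor argument (capacities, embedding dims, initializers, and so on) is omitted and should be taken from `dynamicemb_config.py`, so treat this as a configuration sketch rather than a complete setup.

```python
import torch
from dynamicemb import DynamicEmbTableOptions

# caching=False: one hybrid table whose values are concatenated across GPU and host memory.
hybrid_options = DynamicEmbTableOptions(
    index_type=torch.int64,
    caching=False,
    bucket_capacity=128,   # documented default for the non-caching layout
    # ... capacity, dims, initializer, and other required fields omitted ...
)

# caching=True: a GPU table acting as a cache in front of a host table; hot keys stay
# on the GPU, evicted keys spill to the host table, and are finally dropped by score.
cached_options = DynamicEmbTableOptions(
    index_type=torch.int64,
    caching=True,
    bucket_capacity=1024,  # documented default when the GPU table serves as a cache
    # ... capacity, dims, initializer, and other required fields omitted ...
)
```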
@@ -375,7 +409,7 @@ Dynamic embedding table parameter class, used to configure the parameters for ea
         For the multi-GPUs scenario of model parallelism, every rank's score_strategy should keep the same for one table,
         as they are the same table, but stored on different ranks.
     bucket_capacity : int
-        Capacity of each bucket in HKV, and default is 128(using 1024 when HKV serves as cache).
+        Capacity of each bucket in the hash table, and default is 128 (using 1024 when the table serves as cache).
         A key will only be mapped to one bucket.
         When the bucket is full, the key with the smallest score in the bucket will be evicted, and its slot will be used to store a new key.
         The larger the bucket capacity, the more accurate the score based eviction will be, but it will also result in performance loss.
@@ -390,7 +424,7 @@ Dynamic embedding table parameter class, used to configure the parameters for ea
         When `caching=True`, it decides the table capacity of the GPU table.
     external_storage: Storage
         The external storage/ParamterServer which inherits the interface of Storage, and can be configured per table.
-        If not provided, will using KeyValueTable as the Storage.
+        If not provided, will using DynamicEmbeddingTable as the Storage.
     index_type : Optional[torch.dtype], optional
         Index type of sparse features, will be set to DEFAULT_INDEX_TYPE(torch.int64) by default.
     admit_strategy : Optional[AdmissionStrategy], optional
@@ -405,8 +439,7 @@ Dynamic embedding table parameter class, used to configure the parameters for ea
 
     Notes
     -----
-    For detailed descriptions and additional context on each parameter, please refer to the documentation at
-    https://github.com/NVIDIA-Merlin/HierarchicalKV.
+    For detailed descriptions and additional context on each parameter, please refer to the documentation in this repository.
     """
 
     training: bool = True
@@ -762,7 +795,7 @@ class FrequencyAdmissionStrategy(AdmissionStrategy):
 
 Once the model containing `EmbeddingCollection` is built and initialized through `DistributedModelParallel`, it can be trained and evaluated on each GPU like a single GPU, with torchrec completing communication between different GPUs.
 
-The switching between training and evaluation modes should be consistent with `nn.Module`, while `training` in [DynamicEmbTableOptions](../dynamicemb/dynamicemb_config.py) is used to guide whether to allocate memory to optimizer states when builds the table.
+The switching between training and evaluation modes should be consistent with `nn.Module`, while `training` in [DynamicEmbTableOptions](./dynamicemb/dynamicemb_config.py) is used to guide whether to allocate memory to optimizer states when builds the table.
 
 Due to limited resources, the dynamic embedding table does not pre allocate memory for all keys. If a key appears for the first time during training, it will be initialized immediately during the training process. Please see `initializer_args` and `eval_initializer_args` in `DynamicEmbTableOptions` for more information.
 
@@ -772,12 +805,12 @@ The size of the table is finite, but the set of keys during training may be infi
 
 ## Caching and prefetch
 
-dynamicemb supports caching hot embeddings on GPU memory, and you can prefetch keys from host to device like torchrec(document and example is waiting to append, and now please see `test_prefetch_flush_in_cache` in [test prefetch](./test/test_batched_dynamic_embedding_tables_v2.py)).
+dynamicemb supports caching hot embeddings on GPU memory, and you can prefetch keys from host to device like torchrec. Caching and prefetch work for both sequence mode (`NONE`) and pooling modes (`SUM`/`MEAN`). See `test_prefetch_flush_in_cache` in [test prefetch](./test/test_batched_dynamic_embedding_tables_v2.py) for usage examples.
 
 ## External storage
 
-dynamicemb supports external storage once `external_storage` in `DynamicEmbTableOptions` inherits the `Storage` interface under [types.py](../dynamicemb/types.py).
-Refer to demo `PyDictStorage` in [uint test](../test/test_batched_dynamic_embedding_tables_v2.py) for detailed usage.
+dynamicemb supports external storage once `external_storage` in `DynamicEmbTableOptions` inherits the `Storage` interface under [types.py](./dynamicemb/types.py).
+Refer to demo `PyDictStorage` in [unit test](./test/test_batched_dynamic_embedding_tables_v2.py) for detailed usage.
 
 
 ## Table expansion
@@ -791,7 +824,7 @@ Dump/Load and incremental dump is different from general module in PyTorch, beca
 
 So dynamicemb provides dedicated interface to load/save models' states, and provide conditional dump to support online training.
 
-Please see `DynamicEmbDump`, `DynamicEmbLoad`, `incremental_dump` in [APIs Doc](../DynamicEmb_APIs.md) for more information.
+Please see `DynamicEmbDump`, `DynamicEmbLoad`, `incremental_dump` in [APIs Doc](./DynamicEmb_APIs.md) for more information.
 
 ## Deterministic mode
 
corelib/dynamicemb/README.md

Lines changed: 10 additions & 9 deletions
@@ -1,6 +1,6 @@
 # DynamicEmb
 
-DynamicEmb is a Python package that provides model-parallel dynamic embedding tables and embedding lookup functionalities for TorchREC, specifically targeting the sparse training aspects of recommendation systems. Currently, DynamicEmb utilizes the [HierarchicalKV](https://github.com/NVIDIA-Merlin/HierarchicalKV) hash table backend, which is designed to store key-value (feature-embedding) pairs in the high-bandwidth memory (HBM) of GPUs as well as in host memory.
+DynamicEmb is a Python package that provides model-parallel dynamic embedding tables and embedding lookup functionalities for TorchREC, specifically targeting the sparse training aspects of recommendation systems. DynamicEmb uses a GPU-optimized scored hash table backend to store key-value (feature-embedding) pairs in the high-bandwidth memory (HBM) of GPUs as well as in host memory.
 
 The lookup kernel algorithms implemented in DynamicEmb primarily leverage portions of the algorithms from the [EMBark](https://dl.acm.org/doi/abs/10.1145/3640457.3688111) paper (Embedding Optimization for Training Large-scale Deep Learning Recommendation Systems with EMBark).
 
@@ -29,6 +29,8 @@ The lookup kernel algorithms implemented in DynamicEmb primarily leverage portio
 
 - Support for creating dynamic embedding tables within `EmbeddingBagCollection` and `EmbeddingCollection` in TorchREC, allowing for embedding storage and lookup, and enabling coexistence with native Torch embedding tables within Torch models.
 
+- **Pooling Mode Support**: DynamicEmb supports `SUM`, `MEAN`, and `NONE` (sequence) pooling modes with fused CUDA kernels for both forward and backward passes. Tables with different embedding dimensions (mixed-D) are fully supported in pooling mode.
+
 - Support for optimizer types: `EXACT_SGD`,`ADAM`,`EXACT_ADAGRAD`,`EXACT_ROWWISE_ADAGRAD`.
 
 - Support for automatically parallel `dump`/`load` of embedding weights in dynamic embedding tables.
@@ -93,7 +95,7 @@ Regarding how to use the DynamicEmb APIs and their parameters, please refer to t
 3. The allocated memory for dynamic embedding tables may have slight differences from the specified `num_embeddings` because each dynamic embedding table must set a capacity as a power of 2. This will be automatically calculated by the code, so please ensure that `num_embeddings` is aligned to a power of 2 when applying.
 4. The lookup process for each dynamic embedding table incurs additional overhead from unique or radix sort operations. Therefore, if you request a large number of small dynamic embedding tables for lookup, the performance will be poor. Since the lookup range of dynamic embedding tables is particularly large (using the entire range of `int64_t`), it is recommended to create one large embedding table and perform a fused lookup for multiple features.
 5. Although dynamic embedding tables can be trained together with TorchREC tables, they cannot be fused together for embedding lookup. Therefore, it is recommended to select dynamic embedding tables for all model-parallel tables during training.
-6. Currently, DynamicEmb supports training with TorchREC's `EmbeddingBagCollection` and `EmbeddingCollection`. However, in version v0.1, the main lookup process of `EmbeddingBagCollection` is implemented using torch's ops, not fuse a lot of cuda kernels, which may result in some performance issues. Will fix this performance problem in future versions.
+6. DynamicEmb supports training with TorchREC's `EmbeddingBagCollection` (pooling mode: SUM/MEAN) and `EmbeddingCollection` (sequence mode). Both modes use fused CUDA kernels for embedding lookup and gradient reduction. Tables with different embedding dimensions are supported in pooling mode.
 
 ### DynamicEmb Insertion Behavior Checking Modes
 
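Editor's note on point 6 above: the pooling mode follows from which TorchREC module a table lives in, with `EmbeddingBagCollection` pooling bags (SUM/MEAN) and `EmbeddingCollection` returning per-index rows (sequence). The sketch below is plain TorchREC on CPU with no dynamicemb or sharding involved; making these tables dynamic additionally requires the DynamicEmb constraints described in DynamicEmb_APIs.md.

```python
import torch
from torchrec.modules.embedding_configs import EmbeddingBagConfig, PoolingType
from torchrec.modules.embedding_modules import EmbeddingBagCollection
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

# One table serving one feature, pooled with SUM (use PoolingType.MEAN for mean pooling).
ebc = EmbeddingBagCollection(
    tables=[
        EmbeddingBagConfig(
            name="t1",
            embedding_dim=8,
            num_embeddings=1024,
            feature_names=["f1"],
            pooling=PoolingType.SUM,
        )
    ],
    device=torch.device("cpu"),
)

# Batch of two samples: bag sizes 2 and 1.
kjt = KeyedJaggedTensor.from_lengths_sync(
    keys=["f1"],
    values=torch.tensor([1, 5, 9]),
    lengths=torch.tensor([2, 1]),
)
pooled = ebc(kjt)          # KeyedTensor of pooled embeddings
print(pooled["f1"].shape)  # torch.Size([2, 8]) -> (batch_size, D)
```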
@@ -106,7 +108,7 @@ To prevent this behavior from affecting training without user awareness, Dynamic
 #### Example
 
 ```python
-from dynamic_emb import DynamicEmbTableOptions, DynamicEmbCheckMode
+from dynamicemb import DynamicEmbTableOptions, DynamicEmbCheckMode
 
 # Configure the DynamicEmbTableOptions with safe check mode enabled
 table_options = DynamicEmbTableOptions(
@@ -126,12 +128,11 @@ To get started with DynamicEmb, we highly recommend checking out the [example.py
 ## Future Plans
 
 1. Support the latest version of TorchREC and continuously follow TorchREC's version updates.
-2. Continuously optimize the performance of embedding lookup and embedding bag lookup.
-3. Support multiple optimizer types, aligning with the optimizer types supported by TorchREC.
-4. Support more configurations for dynamic embedding table eviction mechanisms.
-5. Support the separation of backward and optimizer update (required by certain large language model frameworks like Megatron), to better support large-scale GR training.
-6. Add more shard types for dynamic embedding tables, including `table-wise`, `table-row-wise` and `column-wise`.
+2. Support the separation of backward and optimizer update (required by certain large language model frameworks like Megatron), to better support large-scale GR training.
+3. Add more shard types for dynamic embedding tables, including `table-wise`, `table-row-wise` and `column-wise`.
 
 ## Acknowledgements
 
-We would like to thank the Meta team and specially [Huanyu He](https://github.com/TroyGarden) for their support in [TorchRec](https://github.com/pytorch/torchrec).
+We would like to thank the Meta team and specially [Huanyu He](https://github.com/TroyGarden) for their support in [TorchRec](https://github.com/pytorch/torchrec).
+
+We also acknowledge the [HierarchicalKV](https://github.com/NVIDIA-Merlin/HierarchicalKV) project, which inspired the scored hash table design used in DynamicEmb.

corelib/dynamicemb/benchmark/benchmark_batched_dynamicemb_tables.py

Lines changed: 1 addition & 40 deletions
@@ -16,7 +16,6 @@
 import argparse
 import json
 import os
-from typing import cast
 
 import numpy as np
 import torch
@@ -32,8 +31,6 @@
     EmbOptimType,
 )
 from dynamicemb.batched_dynamicemb_tables import BatchedDynamicEmbeddingTablesV2
-from dynamicemb.key_value_table import KeyValueTable
-from dynamicemb_extensions import DynamicEmbTable, insert_or_assign
 from fbgemm_gpu.runtime_monitor import StdLogStatsReporterConfig
 from fbgemm_gpu.split_embedding_configs import EmbOptimType as OptimType
 from fbgemm_gpu.split_embedding_configs import SparseType
@@ -307,46 +304,10 @@ def generate_sequence_sparse_feature(args, device):
     )
 
 
-class TableShim:
-    def __init__(self, table):
-        if isinstance(table, DynamicEmbTable):
-            self.table = cast(DynamicEmbTable, table)
-        elif isinstance(table, KeyValueTable):
-            self.table = table
-        else:
-            raise ValueError("Not support table type")
-
-    def optim_states_dim(self) -> int:
-        if isinstance(self.table, DynamicEmbTable):
-            return self.table.optstate_dim()
-        else:
-            return self.table.value_dim() - self.table.embedding_dim()
-
-    def init_optim_state(self) -> float:
-        if isinstance(self.table, DynamicEmbTable):
-            return self.table.get_initial_optstate()
-        else:
-            return self.table.init_optimizer_state()
-
-    def insert(
-        self,
-        n,
-        unique_indices,
-        unique_values,
-        scores,
-    ) -> None:
-        if isinstance(self.table, DynamicEmbTable):
-            insert_or_assign(self.table, n, unique_indices, unique_values, scores)
-        else:
-            # self.table.set_score(scores[0].item())
-            self.table.insert(unique_indices, unique_values, scores)
-
-
 def create_dynamic_embedding_tables(args, device):
     table_options = []
     table_num = args.num_embedding_table
     for i in range(table_num):
-        TableModule = BatchedDynamicEmbeddingTablesV2
         table_options.append(
             DynamicEmbTableOptions(
                 index_type=torch.int64,
@@ -365,7 +326,7 @@ def create_dynamic_embedding_tables(args, device):
             )
         )
 
-    var = TableModule(
+    var = BatchedDynamicEmbeddingTablesV2(
         table_options=table_options,
         table_names=[table_idx_to_name(i) for i in range(table_num)],
         use_index_dedup=args.use_index_dedup,