propagate shard offsets for KV ZCH inference operator (#3178)

emlin · facebook-github-bot · commit 51078e82e51f · 2025-07-09T20:24:38.000-07:00
Summary: Pull Request resolved: #3178. Populate the ZCH v.Next sharding offsets to the inference operator during publish; these offsets will be used during weight loading on the inference side. Reviewed By: chenyuzhcy Differential Revision: D77989209 fbshipit-source-id: 95fdb750e109dc17eaeea264133437100819ed60
diff --git a/torchrec/distributed/embedding_lookup.py b/torchrec/distributed/embedding_lookup.py
@@ -1169,10 +1169,23 @@ def __init__(
         self._is_empty_rank: List[bool] = []
         for rank in range(world_size):
             empty_rank = len(grouped_configs_per_rank[rank]) == 0
-            # Propagate shard index to get the correct runtime_device based on shard metadata
-            # in case of heterogenous sharding of a single table across different device types
+            grouped_configs_per_rank_elem = grouped_configs_per_rank[rank]
+            contains_virtual_table = any(
+                config.is_using_virtual_table()
+                for config in grouped_configs_per_rank_elem
+            )
+            # In case of heterogeneous sharding of a single table across
+            # different device types i.e. when device_type_from_sharding_infos
+            # is a tuple OR if any of the tables is a virtual table, we can for
+            # now assume that the table is row_wise sharded and the shard_index
+            # can be set to the rank. shard_index is used downstream to get
+            # runtime_device (or row alignment) as well as to get the shard
+            # offsets for virtual tables
             shard_index = (
-                rank if isinstance(device_type_from_sharding_infos, tuple) else None
+                rank
+                if isinstance(device_type_from_sharding_infos, tuple)
+                or contains_virtual_table
+                else None
             )
             self._is_empty_rank.append(empty_rank)
             if not empty_rank:
@@ -1235,10 +1248,23 @@ def __init__(
                 "meta" if device is not None and device.type == "meta" else "cuda"
             )
         for rank in range(world_size):
-            # propagate shard index to get the correct runtime_device based on shard metadata
-            # in case of heterogenous sharding of a single table acorss different device types
+            grouped_configs_per_rank_elem = grouped_configs_per_rank[rank]
+            contains_virtual_table = any(
+                config.is_using_virtual_table()
+                for config in grouped_configs_per_rank_elem
+            )
+            # In case of heterogeneous sharding of a single table across
+            # different device types i.e. when device_type_from_sharding_infos
+            # is a tuple OR if any of the tables is a virtual table, we can for
+            # now assume that the table is row_wise sharded and the shard_index
+            # can be set to the rank. shard_index is used downstream to get
+            # runtime_device (or row alignment) as well as to get the shard
+            # offsets for virtual tables
             shard_index = (
-                rank if isinstance(device_type_from_sharding_infos, tuple) else None
+                rank
+                if isinstance(device_type_from_sharding_infos, tuple)
+                or contains_virtual_table
+                else None
             )
             device = rank_device(device_type, rank)
             self._embedding_lookups_per_rank.append(
diff --git a/torchrec/distributed/embedding_sharding.py b/torchrec/distributed/embedding_sharding.py
@@ -564,6 +564,7 @@ def _group_tables_per_rank(
                     table.data_type,
                 ),
                 _prefetch_and_cached(table),
+                table.use_virtual_table if is_inference else None,
             )
             # micromanage the order of we traverse the groups to ensure backwards compatibility
             if grouping_key not in groups:
@@ -579,6 +580,7 @@ def _group_tables_per_rank(
                 compute_kernel_type,
                 _,
                 _,
+                use_virtual_table,
             ) = grouping_key
             grouped_tables = groups[grouping_key]
             # remove non-native fused params
diff --git a/torchrec/distributed/embedding_types.py b/torchrec/distributed/embedding_types.py
@@ -301,6 +301,9 @@ def embedding_shard_metadata(self) -> List[Optional[ShardMetadata]]:
                 embedding_shard_metadata.append(table.local_metadata)
         return embedding_shard_metadata
 
+    def is_using_virtual_table(self) -> bool:  # True if any of self.embedding_tables is virtual
+        return any(table.use_virtual_table for table in self.embedding_tables)
+
 
 F = TypeVar("F", bound=Multistreamable)
 T = TypeVar("T")
diff --git a/torchrec/distributed/quant_embedding_kernel.py b/torchrec/distributed/quant_embedding_kernel.py
@@ -120,6 +120,30 @@ def _quantize_weight(
     return quant_weight_list
 
 
+def _get_shard_offsets_for_kv_zch(
+    config: GroupedEmbeddingConfig,
+    shard_index: int,
+) -> List[int]:
+    """
+    Given that kv zch tables are rw sharded, return each table's row offset for
+    shard ``shard_index``, to be used within the kv zch look up kernel.
+    """
+    shard_row_offsets = []
+    for table in config.embedding_tables:
+        assert (
+            table.global_metadata is not None
+        ), f"Expected global_metadata to be populated for table {table.name} to get shard offsets for kv zch look up kernel"
+        assert (
+            len(table.global_metadata.shards_metadata) > shard_index
+        ), f"Expected table {table.name} to have more shards than shard index {shard_index}. Found {len(table.global_metadata.shards_metadata)} shards"
+        shard_row_offsets.append(
+            # pyre-ignore: Undefined attribute [16]
+            table.global_metadata.shards_metadata[shard_index].shard_offsets[0]
+        )
+    logger.info(f"Shard row offsets for kv zch look up table: {shard_row_offsets=}")
+    return shard_row_offsets
+
+
 def _get_runtime_device(
     device: Optional[torch.device],
     config: GroupedEmbeddingConfig,
@@ -293,6 +317,16 @@ def __init__(
         else:
             tbe_clazz = IntNBitTableBatchedEmbeddingBagsCodegen
 
+        if is_virtual_table:
+            assert (
+                shard_index is not None and shard_index >= 0
+            ), "valid shard_index must be provided for kv zch batch embedding to compute shard offsets"
+            shard_offsets_for_kv_zch = _get_shard_offsets_for_kv_zch(
+                config, shard_index
+            )
+        else:
+            shard_offsets_for_kv_zch = None
+
         self._emb_module: IntNBitTableBatchedEmbeddingBagsCodegen = tbe_clazz(
             embedding_specs=embedding_specs,
             device=device,
@@ -310,6 +344,12 @@ def __init__(
         )
         if device is not None:
             self._emb_module.initialize_weights()
+        if shard_offsets_for_kv_zch is not None:
+            assert (
+                tbe_clazz == KVEmbeddingInference
+            ), "shard_offsets_for_kv_zch should be computed only for kv zch kernel"
+            # pyre-ignore: Call error [29]
+            self._emb_module.init_tbe_config(shard_offsets_for_kv_zch)
 
     def init_parameters(self) -> None:
         pass
@@ -479,6 +519,16 @@ def __init__(
             if is_virtual_table
             else IntNBitTableBatchedEmbeddingBagsCodegen
         )
+        if is_virtual_table:
+            assert (
+                shard_index is not None and shard_index >= 0
+            ), "valid shard_index must be provided for kv zch batch embedding to compute shard offsets"
+            shard_offsets_for_kv_zch = _get_shard_offsets_for_kv_zch(
+                config, shard_index
+            )
+        else:
+            shard_offsets_for_kv_zch = None
+
         self._emb_module: IntNBitTableBatchedEmbeddingBagsCodegen = embedding_clazz(
             embedding_specs=[
                 (
@@ -511,6 +561,12 @@ def __init__(
         )
         if device is not None:
             self._emb_module.initialize_weights()
+        if shard_offsets_for_kv_zch is not None:
+            assert (
+                embedding_clazz == KVEmbeddingInference
+            ), "shard_offsets_for_kv_zch should be computed only for kv zch kernel"
+            # pyre-ignore: Call error [29]
+            self._emb_module.init_tbe_config(shard_offsets_for_kv_zch)
 
     @property
     def emb_module(
diff --git a/torchrec/distributed/sharding/tw_sharding.py b/torchrec/distributed/sharding/tw_sharding.py
@@ -184,6 +184,7 @@ def _shard(
                     weight_init_min=info.embedding_config.weight_init_min,
                     fused_params=info.fused_params,
                     num_embeddings_post_pruning=info.embedding_config.num_embeddings_post_pruning,
+                    use_virtual_table=info.embedding_config.use_virtual_table,
                 )
             )
         return tables_per_rank
diff --git a/torchrec/distributed/sharding/twrw_sharding.py b/torchrec/distributed/sharding/twrw_sharding.py
@@ -204,6 +204,7 @@ def _shard(
                         weight_init_max=info.embedding_config.weight_init_max,
                         weight_init_min=info.embedding_config.weight_init_min,
                         fused_params=info.fused_params,
+                        use_virtual_table=info.embedding_config.use_virtual_table,
                     )
                 )
 
diff --git a/torchrec/distributed/tests/test_quant_sequence_model_parallel.py b/torchrec/distributed/tests/test_quant_sequence_model_parallel.py
@@ -13,6 +13,10 @@
 
 import hypothesis.strategies as st
 import torch
+from fbgemm_gpu.split_table_batched_embeddings_ops_inference import (
+    IntNBitTableBatchedEmbeddingBagsCodegen,
+)
+from fbgemm_gpu.tbe.cache.kv_embedding_ops_inference import KVEmbeddingInference
 from hypothesis import given, settings, Verbosity
 from torch import nn, quantization as quant
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
@@ -24,7 +28,7 @@
 )
 from torchrec.distributed.tests.test_sequence_model import TestSequenceSparseNN
 from torchrec.distributed.types import ModuleSharder, ShardingEnv, ShardingType
-from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.modules.embedding_configs import EmbeddingConfig, NoEvictionPolicy
 from torchrec.modules.embedding_modules import EmbeddingCollection
 from torchrec.quant.embedding_modules import (
     EmbeddingCollection as QuantEmbeddingCollection,
@@ -203,3 +207,96 @@ def test_quant_pred_shard(
         )
         local_batch = local_batch.to(device)
         sharded_quant_model(local_batch.idlist_features)
+
+    # pyre-fixme[56]
+    @unittest.skipIf(
+        torch.cuda.device_count() <= 1,
+        "Not enough GPUs available",
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=1, deadline=None)  # NOTE(review): no @given on this test — confirm @settings is needed
+    def test_sharded_quant_kv_zch(self) -> None:  # virtual tables -> KVEmbeddingInference kernel + per-rank shard offsets
+        device = torch.device("cuda:0")
+        num_features = 4
+
+        tables = [
+            EmbeddingConfig(
+                num_embeddings=(i + 1) * 11,
+                embedding_dim=16,
+                name="table_" + str(i),
+                feature_names=["feature_" + str(i)],
+                use_virtual_table=True if i % 2 == 0 else False,  # even-indexed tables (table_0, table_2) are virtual
+                virtual_table_eviction_policy=(
+                    NoEvictionPolicy() if i % 2 == 0 else None
+                ),
+            )
+            for i in range(num_features)
+        ]
+        # wrap in sequential because _quantize only applies to submodules...
+        model = nn.Sequential(EmbeddingCollection(tables=tables, device=device))
+
+        quant_model = _quantize(model, quant_state_dict_split_scale_bias=True)
+
+        sharded_quant_model = _shard_modules(
+            module=quant_model,
+            sharders=[
+                cast(
+                    ModuleSharder[torch.nn.Module],
+                    TestQuantECSharder(
+                        sharding_type=ShardingType.ROW_WISE.value,
+                        kernel_type=EmbeddingComputeKernel.QUANT.value,
+                    ),
+                )
+            ],
+            device=device,
+            env=ShardingEnv.from_local(world_size=2, rank=0),
+        )
+
+        sharded_quant_model.load_state_dict(sharded_quant_model.state_dict())
+
+        local_batch, _ = ModelInput.generate(
+            batch_size=16,
+            world_size=1,
+            num_float_features=10,
+            tables=self.tables,  # NOTE(review): self.tables, not the local `tables` built above — confirm intended
+            weighted_tables=[],
+            indices_dtype=torch.int32,
+            lengths_dtype=torch.int32,
+        )
+        local_batch = local_batch.to(device)
+        sharded_quant_model(local_batch.idlist_features)
+        self.assertIsInstance(
+            # pyre-ignore [29]
+            sharded_quant_model[0]
+            ._lookups[0]
+            ._embedding_lookups_per_rank[0]
+            ._emb_modules[0]
+            ._emb_module,
+            KVEmbeddingInference,
+        )
+        self.assertIsInstance(
+            # pyre-ignore [29]
+            sharded_quant_model[0]
+            ._lookups[0]
+            ._embedding_lookups_per_rank[0]
+            ._emb_modules[1]
+            ._emb_module,
+            IntNBitTableBatchedEmbeddingBagsCodegen,
+        )
+        self.assertEqual(
+            # pyre-ignore [29]
+            sharded_quant_model[0]
+            ._lookups[0]
+            ._embedding_lookups_per_rank[0]
+            ._emb_modules[0]
+            ._emb_module.table_sharding_offset,
+            [0, 0],  # rank 0: expected offsets, one entry per virtual table
+        )
+        self.assertEqual(
+            # pyre-ignore [29]
+            sharded_quant_model[0]
+            ._lookups[0]
+            ._embedding_lookups_per_rank[1]
+            ._emb_modules[0]
+            ._emb_module.table_sharding_offset,
+            [6, 17],  # rank 1: presumably ceil(11 / 2) and ceil(33 / 2) rows — confirm against the RW split
+        )

Original file line number	Diff line number	Diff line change
`@@ -184,6 +184,7 @@ def _shard(`
`184`	`184`	`weight_init_min=info.embedding_config.weight_init_min,`
`185`	`185`	`fused_params=info.fused_params,`
`186`	`186`	`num_embeddings_post_pruning=info.embedding_config.num_embeddings_post_pruning,`
	`187`	`+ use_virtual_table=info.embedding_config.use_virtual_table,`
`187`	`188`	`)`
`188`	`189`	`)`
`189`	`190`	`return tables_per_rank`
Original file line number	Diff line number	Diff line change
`@@ -204,6 +204,7 @@ def _shard(`
`204`	`204`	`weight_init_max=info.embedding_config.weight_init_max,`
`205`	`205`	`weight_init_min=info.embedding_config.weight_init_min,`
`206`	`206`	`fused_params=info.fused_params,`
	`207`	`+ use_virtual_table=info.embedding_config.use_virtual_table,`
`207`	`208`	`)`
`208`	`209`	`)`
`209`	`210`