
Commit f9d4bbf

emlin authored and facebook-github-bot committed
enable feature score data collection in torchrec (meta-pytorch#3285)
Summary:
Pull Request resolved: meta-pytorch#3285

Add an enable_feature_score_weight_accumulation flag to ShardedEmbeddingCollection. When this flag is true and EC index dedup is enabled, we accumulate each id's KJT weight and occurrence count and write the result back as the KJT weight, so that input dist can distribute the feature scores.

This change is part of the ZCH v.Next feature score eviction story:
- Collect a score for every feature id in the model, e.g. 0.5 for ids from positive examples and 0.2 for ids from negative examples.
- Set the score as the weight value of the input id list feature KJT.
- In EC forward, when id dedup is enabled, aggregate the score and occurrence count of each id.
- Distribute the id scores in the KJT weights.
- In the KVZCH embedding kernel, call forward with weights as an optional parameter.

In the ZCH TBE backend (separate diffs):
- Set the feature scores on the ZCH TBE backend.
- Run eviction based on the id score values.

For the whole story, see: https://docs.google.com/document/d/1TJHKvO1m3-5tYAKZGhacXnGk7iCNAzz7wQlrFbX_LDI/edit?tab=t.0

Reviewed By: duduyi2013

Differential Revision: D79864431

fbshipit-source-id: 4830ff41c79770e83d20a7e49f84a33f938870e4
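To make the first two steps concrete, here is a minimal sketch of attaching per-id feature scores as the KJT's float32 weights. The 0.5/0.2 score values come from the summary above; the feature name and id values are illustrative only, not the production setup.

import torch
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

# Assign a score per feature id: e.g. 0.5 for ids from positive examples,
# 0.2 for ids from negative examples (values from the commit summary).
ids = torch.tensor([100, 200, 100, 300])
is_positive = torch.tensor([True, False, True, False])
scores = torch.where(
    is_positive, torch.tensor(0.5), torch.tensor(0.2)
)  # float32, as required by the assertion in the dedup path

# The scores ride along as the KJT weights, so input dist can carry them
# to the embedding kernel.
features = KeyedJaggedTensor(
    keys=["item_id"],
    values=ids,
    lengths=torch.tensor([4]),
    weights=scores,
)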
1 parent f678776 commit f9d4bbf

File tree

3 files changed: +44 −2 lines changed


torchrec/distributed/batched_embedding_kernel.py

Lines changed: 5 additions & 2 deletions
@@ -88,7 +88,6 @@
     NoEvictionPolicy,
     pooling_type_to_pooling_mode,
     TimestampBasedEvictionPolicy,
-    VirtualTableEvictionPolicy,
 )
 from torchrec.optim.fused import (
     EmptyFusedOptimizer,
@@ -1713,7 +1712,11 @@ def forward(self, features: KeyedJaggedTensor) -> torch.Tensor:
         self._split_weights_res = None
         self._optim.set_sharded_embedding_weight_ids(sharded_embedding_weight_ids=None)

-        return super().forward(features)
+        return self.emb_module(
+            indices=features.values().long(),
+            offsets=features.offsets().long(),
+            weights=features.weights_or_none(),
+        )


 class BatchedFusedEmbedding(BaseBatchedEmbedding[torch.Tensor], FusedOptimizerModule):
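For context on why forward is overridden here: the base class's forward presumably invokes the kernel without the KJT's weights, so attached feature scores would be dropped before reaching the ZCH TBE backend. A stand-alone sketch of the behavioral difference; the emb_module stub below is hypothetical and does not claim to match the real TBE signature.

import torch
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

# Hypothetical stand-in for the embedding kernel call.
def emb_module(indices, offsets, weights=None):
    print("weights seen by kernel:", weights)

features = KeyedJaggedTensor(
    keys=["f1"],
    values=torch.tensor([10, 11, 10]),
    lengths=torch.tensor([3]),
    weights=torch.tensor([0.5, 0.2, 0.5]),  # per-id feature scores
)

# Without the override, weights never reach the kernel:
emb_module(indices=features.values().long(), offsets=features.offsets().long())
# With the override, the scores are forwarded for feature score eviction:
emb_module(
    indices=features.values().long(),
    offsets=features.offsets().long(),
    weights=features.weights_or_none(),
)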

torchrec/distributed/embedding.py

Lines changed: 36 additions & 0 deletions
@@ -47,6 +47,7 @@
     ShardingType,
 )
 from torchrec.distributed.fused_params import (
+    ENABLE_FEATURE_SCORE_WEIGHT_ACCUMULATION,
     FUSED_PARAM_IS_SSD_TABLE,
     FUSED_PARAM_SSD_TABLE_LIST,
 )
@@ -419,6 +420,20 @@ def __init__(
         module_fqn: Optional[str] = None,
     ) -> None:
         super().__init__(qcomm_codecs_registry=qcomm_codecs_registry)
+        self._enable_feature_score_weight_accumulation: bool = False
+
+        if (
+            fused_params is not None
+            and ENABLE_FEATURE_SCORE_WEIGHT_ACCUMULATION in fused_params
+        ):
+            self._enable_feature_score_weight_accumulation = cast(
+                bool, fused_params[ENABLE_FEATURE_SCORE_WEIGHT_ACCUMULATION]
+            )
+            fused_params.pop(ENABLE_FEATURE_SCORE_WEIGHT_ACCUMULATION)
+        logger.info(
+            f"EC feature score weight accumulation enabled: {self._enable_feature_score_weight_accumulation}."
+        )
+
         self._module_fqn = module_fqn
         self._embedding_configs: List[EmbeddingConfig] = module.embedding_configs()
         self._table_names: List[str] = [
@@ -1321,11 +1336,32 @@ def _dedup_indices(
                 input_feature.offsets().to(torch.int64),
                 input_feature.values().to(torch.int64),
             )
+            acc_weights = None
+            if (
+                self._enable_feature_score_weight_accumulation
+                and input_feature.weights_or_none() is not None
+            ):
+                source_weights = input_feature.weights()
+                assert (
+                    source_weights.dtype == torch.float32
+                ), "Only float32 weights are supported for feature score eviction weights."
+
+                acc_weights = torch.ops.fbgemm.jagged_acc_weights_and_counts(
+                    source_weights.view(-1),
+                    reverse_indices,
+                    unique_indices.numel(),
+                )
+
             dedup_features = KeyedJaggedTensor(
                 keys=input_feature.keys(),
                 lengths=lengths,
                 offsets=offsets,
                 values=unique_indices,
+                weights=(
+                    acc_weights.view(torch.float64).view(-1)
+                    if acc_weights is not None
+                    else None
+                ),
             )

             ctx.input_features.append(input_feature)
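A reference sketch of what the accumulation step computes, in plain PyTorch. The packing layout, a (score sum, count) float32 pair per unique id reinterpreted as one float64 element, is inferred from the acc_weights.view(torch.float64).view(-1) call above; the real fbgemm jagged_acc_weights_and_counts op's layout may differ.

import torch

def acc_weights_and_counts_ref(
    source_weights: torch.Tensor,   # float32 score per pre-dedup id
    reverse_indices: torch.Tensor,  # int64 map: original position -> unique slot
    num_unique: int,
) -> torch.Tensor:
    # Per unique id, accumulate the sum of its scores and its occurrence count.
    sums = torch.zeros(num_unique).index_add_(0, reverse_indices, source_weights)
    counts = torch.zeros(num_unique).index_add_(
        0, reverse_indices, torch.ones_like(source_weights)
    )
    # Pack each (sum, count) float32 pair into one float64 slot so the result
    # aligns one-to-one with the deduplicated KJT values.
    return torch.stack([sums, counts], dim=1).view(-1).view(torch.float64)

# Example: ids [7, 9, 7] dedup to [7, 9], so reverse_indices is [0, 1, 0].
packed = acc_weights_and_counts_ref(
    torch.tensor([0.5, 0.2, 0.5]), torch.tensor([0, 1, 0]), 2
)
print(packed.view(torch.float32).view(-1, 2))  # [[1.0, 2.0], [0.2, 1.0]]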

torchrec/distributed/fused_params.py

Lines changed: 3 additions & 0 deletions
@@ -33,6 +33,9 @@
 FUSED_PARAM_SSD_TABLE_LIST: str = "__register_ssd_table_list"
 # Bool fused param per table to check if the table is offloaded to SSD
 FUSED_PARAM_IS_SSD_TABLE: str = "__register_is_ssd_table"
+ENABLE_FEATURE_SCORE_WEIGHT_ACCUMULATION: str = (
+    "enable_feature_score_weight_accumulation"
+)


 class TBEToRegisterMixIn:
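A hedged usage sketch for this new constant. Only the key name and the fact that ShardedEmbeddingCollection.__init__ reads and pops it from fused_params come from this diff; routing the dict through EmbeddingCollectionSharder is an assumption about one plausible way to pass it in.

from torchrec.distributed.embedding import EmbeddingCollectionSharder
from torchrec.distributed.fused_params import ENABLE_FEATURE_SCORE_WEIGHT_ACCUMULATION

fused_params = {
    ENABLE_FEATURE_SCORE_WEIGHT_ACCUMULATION: True,
    # ...any other fused params destined for the embedding kernel
}

# ShardedEmbeddingCollection.__init__ pops the flag, so the remaining
# fused params reach the TBE backend untouched.
sharder = EmbeddingCollectionSharder(fused_params=fused_params)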
