Add util func for kvzch eviction mask (pytorch#4610)

EddyLXJ · facebook-github-bot · commit 6d8e16afbff2 · 2025-08-01T13:25:50.000-07:00
Summary: X-link: pytorch/torchrec#3246 X-link: facebookresearch/FBGEMM#1645 Adds a util func for kvzch that computes an eviction mask from the policy's inference-time eviction thresholds. This is used in publish. Reviewed By: yixin94 Differential Revision: D79045178
diff --git a/fbgemm_gpu/fbgemm_gpu/kvzch_util.py b/fbgemm_gpu/fbgemm_gpu/kvzch_util.py
@@ -0,0 +1,118 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import time
+
+import torch
+from torchrec.modules.embedding_configs import (
+    CountBasedEvictionPolicy,
+    CountTimestampMixedEvictionPolicy,
+    FeatureL2NormBasedEvictionPolicy,
+    NoEvictionPolicy,
+    TimestampBasedEvictionPolicy,
+    VirtualTableEvictionPolicy,
+)
+
+
+def parse_metadata_tensor(metadata_tensor: torch.Tensor):
+    """
+    Parses a kvzch metadata tensor where each element encodes three pieces of information
+    packed into a single 64-bit integer.
+    The 64-bit integer layout is as follows:
+    - The lower 32 bits (bits 0-31) represent the 'timestamp', stored as a uint32.
+      This timestamp is typically in seconds and can represent a range of over 120 years.
+    - The upper 32 bits (bits 32-63) encode two fields packed together:
+        * The lower 31 bits of this upper half (bits 32-62 overall) represent 'count',
+          a 31-bit unsigned integer indicating a usage count or score.
+        * The highest bit of this upper half (bit 63 overall) represents 'used',
+          a boolean flag indicating whether the block is currently in use.
+    This function extracts these three components from each 64-bit integer in the tensor:
+    - 'timestamps' as an int64 tensor whose values fit in the uint32 range
+    - 'counts' as an int64 tensor whose values fit in 31 bits
+    - 'used' as a boolean tensor
+    Args:
+        metadata_tensor (torch.Tensor): A 1D tensor of dtype torch.int64 (asserted), where
+                                        each element encodes timestamp, count, and used flag.
+    Returns:
+        tuple: (timestamps, counts, used)
+            - timestamps (tensor): int64 array of timestamps extracted from the tensor.
+            - counts (tensor): int64 array of counts extracted from the tensor.
+            - used (tensor): boolean array indicating usage flags extracted from the tensor.
+    """
+    assert metadata_tensor.dtype == torch.int64
+    timestamps = metadata_tensor & 0xFFFFFFFF  # Extract lower 32 bits as timestamp
+    count_used = (
+        metadata_tensor >> 32
+    )  # Upper 32 bits (count + used); the masks below drop any sign-extended bits
+    counts = count_used & 0x7FFFFFFF  # Lower 31 bits of upper half as count
+    used = ((count_used >> 31) & 1).to(
+        torch.bool
+    )  # Highest bit of upper half as used flag
+    return timestamps, counts, used
+
+
+def get_kv_zch_eviction_mask(
+    metadata_tensor: torch.Tensor,
+    eviction_policy: VirtualTableEvictionPolicy,
+):
+    """
+    Returns a boolean keep-mask for kvzch blocks: True means keep, False means evict.
+    Criteria come from the inference thresholds on 'eviction_policy'; unhandled policy types raise ValueError.
+    Args:
+        metadata_tensor (torch.Tensor): A 1D tensor of dtype torch.int64, where each
+                                        element encodes timestamp, count, and used flag.
+        eviction_policy (VirtualTableEvictionPolicy): The eviction policy to use.
+    Returns:
+        torch.Tensor: A 1D boolean tensor of the same size as 'metadata_tensor', where False indicates a block should be evicted.
+    """
+
+    eviction_mask = torch.ones_like(
+        metadata_tensor, dtype=torch.bool
+    )  # Initialize mask to True (keep all blocks)
+    if isinstance(eviction_policy, NoEvictionPolicy):
+        return eviction_mask
+
+    # Unpack timestamps and counts; the 'used' flag is not consulted by any policy here
+    timestamps, counts, _ = parse_metadata_tensor(metadata_tensor)
+
+    # Each branch below replaces the mask with True where the block is kept.
+    # Timestamp checks compare against wall-clock time.time(), truncated to seconds.
+    if isinstance(eviction_policy, CountBasedEvictionPolicy):
+        inference_eviction_threshold = eviction_policy.inference_eviction_threshold
+        eviction_mask = counts >= inference_eviction_threshold
+
+    elif isinstance(eviction_policy, TimestampBasedEvictionPolicy):
+        inference_eviction_ttl_mins = eviction_policy.inference_eviction_ttl_mins
+        if inference_eviction_ttl_mins != 0:  # ttl == 0 means no eviction (mask stays all True)
+            current_time = int(time.time())
+            eviction_mask = (
+                current_time - timestamps
+            ) <= inference_eviction_ttl_mins * 60
+
+    elif isinstance(eviction_policy, CountTimestampMixedEvictionPolicy):
+        inference_eviction_threshold = eviction_policy.inference_eviction_threshold
+        inference_eviction_ttl_mins = eviction_policy.inference_eviction_ttl_mins
+        current_time = int(time.time())
+        eviction_ttl_secs = inference_eviction_ttl_mins * 60
+        if inference_eviction_threshold == 0:
+            count_mask = torch.ones_like(counts, dtype=torch.bool)
+        else:
+            count_mask = counts >= inference_eviction_threshold
+
+        if inference_eviction_ttl_mins == 0:
+            timestamp_mask = torch.ones_like(counts, dtype=torch.bool)
+        else:
+            timestamp_mask = (current_time - timestamps) <= eviction_ttl_secs
+        eviction_mask = count_mask & timestamp_mask
+
+    elif isinstance(eviction_policy, FeatureL2NormBasedEvictionPolicy):
+        # Feature L2 norm-based eviction is not implemented here:
+        # the all-True mask is returned unchanged, i.e. nothing is evicted
+        pass
+    else:
+        raise ValueError("Unsupported eviction policy")
+
+    return eviction_mask
diff --git a/fbgemm_gpu/requirements.txt b/fbgemm_gpu/requirements.txt
@@ -28,3 +28,4 @@ setuptools_git_versioning
 tabulate
 patchelf
 fairscale
+torchrec
diff --git a/fbgemm_gpu/requirements_genai.txt b/fbgemm_gpu/requirements_genai.txt
@@ -30,3 +30,4 @@ setuptools_git_versioning
 tabulate
 patchelf
 fairscale
+torchrec
diff --git a/fbgemm_gpu/test/tbe/ssd/kvzch_util_test.py b/fbgemm_gpu/test/tbe/ssd/kvzch_util_test.py
@@ -0,0 +1,143 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import time
+import unittest
+
+import numpy as np
+import torch
+
+from fbgemm_gpu.kvzch_util import get_kv_zch_eviction_mask, parse_metadata_tensor
+from torchrec.modules.embedding_configs import (
+    CountBasedEvictionPolicy,
+    CountTimestampMixedEvictionPolicy,
+    NoEvictionPolicy,
+    TimestampBasedEvictionPolicy,
+    VirtualTableEvictionPolicy,
+)
+
+from ..common import gpu_unavailable
+
+
+@unittest.skipIf(*gpu_unavailable)
+class KvzchUtilsTest(unittest.TestCase):
+    def test_basic_parsing(self) -> None:
+        """
+        Test typical parsing of (timestamp, count, used), including used=0 and used=1 cases.
+        """
+        # Compose metadata values as 64-bit integers:
+        # [timestamp=7, count=13, used=0]
+        v1 = 7 | (13 << 32)  # used=0 (highest bit not set)
+        # [timestamp=42, count=99, used=1]
+        # used=1 is bit 63; subtracting 1 << 63 yields the signed int64 with that bit set
+        v2 = (42 | (99 << 32)) - (1 << 63)  # set highest bit
+        # [timestamp=0xABCDEF01, count=0x1ABCDE0, used=1]
+        v3 = (0xABCDEF01 | (0x1ABCDE0 << 32)) - (1 << 63)
+        vals = [v1, v2, v3]
+        tensor = torch.tensor(vals, dtype=torch.int64)
+
+        timestamps, counts, used = parse_metadata_tensor(tensor)
+
+        np.testing.assert_array_equal(
+            timestamps.numpy(), np.array([7, 42, 0xABCDEF01], dtype=np.uint32)
+        )
+        np.testing.assert_array_equal(
+            counts.numpy(), np.array([13, 99, 0x1ABCDE0], dtype=np.uint32)
+        )
+        np.testing.assert_array_equal(
+            used.numpy(), np.array([False, True, True], dtype=bool)
+        )
+
+    def test_edge_cases(self) -> None:
+        """
+        Test edge cases including all zeros, max values, min values, and different used flags.
+        """
+        # All fields zero, used=0
+        v1 = 0
+        # Max timestamp, max count, used=0
+        v2 = 0xFFFFFFFF | (0x7FFFFFFF << 32)  # Used=0 (highest bit = 0)
+        # Min timestamp, min count, used=1
+        v3 = 0 - (1 << 63)  # All fields 0, only highest bit set (used=1)
+
+        vals = [v1, v2, v3]
+        tensor = torch.tensor(vals, dtype=torch.int64)
+
+        timestamps, counts, used = parse_metadata_tensor(tensor)
+
+        np.testing.assert_array_equal(
+            timestamps.numpy(), np.array([0, 0xFFFFFFFF, 0], dtype=np.uint32)
+        )
+        np.testing.assert_array_equal(
+            counts.numpy(), np.array([0, 0x7FFFFFFF, 0], dtype=np.uint32)
+        )
+        np.testing.assert_array_equal(
+            used.numpy(), np.array([False, False, True], dtype=bool)
+        )
+
+    def test_invalid_dtype(self) -> None:
+        """
+        Test that an AssertionError is raised when the tensor dtype is not torch.int64.
+        """
+        tensor = torch.tensor([1, 2, 3], dtype=torch.float32)
+        with self.assertRaises(AssertionError):
+            parse_metadata_tensor(tensor)
+
+
+class GetKvZchEvictionMaskTest(unittest.TestCase):
+    def setUp(self) -> None:
+        # Three packed metadata entries: timestamp in the low 32 bits, count above it.
+        # Subtracting 1 << 63 sets the highest bit (used=1) as a signed int64.
+        self.vals = [
+            (100 | (5 << 32)),  # timestamp=100, count=5, used=0
+            (int(time.time()) - 60 | (10 << 32))
+            - (1 << 63),  # count=10, used=1, timestamp 1 min ago
+            (int(time.time()) - 3600 | (15 << 32))
+            - (1 << 63),  # count=15, used=1, timestamp 1 hour ago
+        ]
+        self.metadata_tensor = torch.tensor(self.vals, dtype=torch.int64)
+
+    def test_count_based_eviction(self) -> None:
+        policy = CountBasedEvictionPolicy(inference_eviction_threshold=10)
+        mask = get_kv_zch_eviction_mask(self.metadata_tensor, policy)
+        # counts are 5,10,15; threshold=10; keep counts >= 10
+        expected = torch.tensor([False, True, True], dtype=torch.bool)
+        self.assertTrue(torch.equal(mask, expected))
+
+    def test_timestamp_based_eviction(self) -> None:
+        policy = TimestampBasedEvictionPolicy(inference_eviction_ttl_mins=30)
+        mask = get_kv_zch_eviction_mask(self.metadata_tensor, policy)
+        # timestamps: 100 (old), now-60s, now-3600s
+        # TTL=30min=1800s, keep timestamps within 1800s
+        expected = torch.tensor([False, True, False], dtype=torch.bool)
+        self.assertTrue(torch.equal(mask, expected))
+
+    def test_count_timestamp_mixed_eviction(self) -> None:
+        policy = CountTimestampMixedEvictionPolicy(
+            inference_eviction_threshold=10, inference_eviction_ttl_mins=30
+        )
+        mask = get_kv_zch_eviction_mask(self.metadata_tensor, policy)
+        # count mask: counts >= 10 -> [False, True, True]
+        # timestamp mask: within 1800s -> [False, True, False]
+        # combined mask = count_mask & timestamp_mask
+        expected = torch.tensor([False, True, False], dtype=torch.bool)
+        self.assertTrue(torch.equal(mask, expected))
+
+    def test_no_eviction_policy(self) -> None:
+        policy = NoEvictionPolicy()
+        mask = get_kv_zch_eviction_mask(self.metadata_tensor, policy)
+        # No eviction: every entry is kept, so the mask is all True
+        expected = torch.ones_like(self.metadata_tensor, dtype=torch.bool)
+        self.assertTrue(torch.equal(mask, expected))
+
+    def test_unsupported_policy(self) -> None:
+        class DummyPolicy(VirtualTableEvictionPolicy):
+            pass
+
+        policy = DummyPolicy()
+        with self.assertRaises(ValueError):
+            get_kv_zch_eviction_mask(self.metadata_tensor, policy)