vllm-project
diff --git a/src/compressed_tensors/offload/cache/disk.py b/src/compressed_tensors/offload/cache/disk.py
Lines changed: 11 additions & 3 deletions
diff --git a/tests/test_offload/cache/conftest.py b/tests/test_offload/cache/conftest.py
Lines changed: 19 additions & 0 deletions
diff --git a/tests/test_offload/cache/helpers.py b/tests/test_offload/cache/helpers.py
Lines changed: 59 additions & 64 deletions
diff --git a/tests/test_offload/cache/test_cpu.py b/tests/test_offload/cache/test_cpu.py
Lines changed: 25 additions & 26 deletions
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-import tempfile
 from typing import TYPE_CHECKING, Optional
 
 import torch
@@ -39,7 +38,12 @@ class DiskCache(OffloadCache):
 
     def __init__(self, onload_device: torch.device, offload_dir: Optional[str] = None):
         super().__init__(onload_device)
-        self.offload_dir = offload_dir or tempfile.mkdtemp()
+        if offload_dir is None:
+            raise ValueError(
+                "Must provide an `offload_dir` to perform disk offloading "
+                "(add `offload_folder` argument to `from_pretrained`)"
+            )
+        self.offload_dir = offload_dir
 
     def onload(self, offloaded: torch.Tensor | None) -> torch.Tensor | None:
         """
@@ -139,7 +143,11 @@ def create_checkpoint_symlink(
         offload_dir: str | os.PathLike | None,
     ) -> None:
         assert is_rank0(), "Must call on rank 0 to avoid id collisions between ranks"
-        offload_dir = offload_dir or tempfile.mkdtemp()
+        if offload_dir is None:
+            raise ValueError(
+                "Must provide an `offload_dir` to perform disk offloading "
+                "(add `offload_folder` argument to `from_pretrained`)"
+            )
         file_name = f"{cls._new_file_prefix}{id(offloaded)}.safetensors"
         file_path = os.path.join(offload_dir, file_name)
 
 
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+import pytest
+from compressed_tensors.offload import OffloadCache
+
+
+@pytest.fixture()
+def offload_cache(offload_device, onload_device, tmp_path):
+    if offload_device == "disk":
+        offload_dir = str(tmp_path / "offload_dir")
+        os.makedirs(offload_dir)
+        return OffloadCache.cls_from_device(offload_device)(
+            onload_device, offload_dir=offload_dir
+        )
+    else:
+        return OffloadCache.cls_from_device(offload_device)(onload_device)
@@ -5,57 +5,51 @@
 from weakref import ref
 
 import torch
-from compressed_tensors.offload import OffloadCache
 from tests.test_offload.conftest import assert_device_equal, assert_tensor_equal
 
 
-def _test_onloading(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
+def _test_onloading(offload_device, onload_device, offload_cache):
     tensor = torch.ones(10)
-    cache["weight"] = tensor
-    onloaded = cache["weight"]
+    offload_cache["weight"] = tensor
+    onloaded = offload_cache["weight"]
 
     assert type(onloaded) is type(tensor)
     assert_tensor_equal(onloaded, tensor, onload_device)
 
 
-def _test_garbage_collect(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
-    cache["weight"] = torch.ones(10)
-    onloaded = cache["weight"]
+def _test_garbage_collect(offload_device, onload_device, offload_cache):
+    offload_cache["weight"] = torch.ones(10)
+    onloaded = offload_cache["weight"]
 
     onloaded_ref = ref(onloaded)
     del onloaded
     gc.collect()
     assert onloaded_ref() is None
 
 
-def _test_offload(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
+def _test_offload(offload_device, onload_device, offload_cache):
     tensor = torch.ones(10, device=onload_device)
-    offloaded = cache.offload(tensor)
+    offloaded = offload_cache.offload(tensor)
     assert_device_equal(offloaded.device, offload_device)
     assert_tensor_equal(offloaded, tensor, offload_device)
 
 
-def _test_onload(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
+def _test_onload(offload_device, onload_device, offload_cache):
     tensor = torch.ones(10, device=onload_device)
-    onloaded = cache.onload(cache.offload(tensor))
+    onloaded = offload_cache.onload(offload_cache.offload(tensor))
     assert_device_equal(onloaded.device, onload_device)
     assert_tensor_equal(onloaded, tensor, onload_device)
 
 
-def _test_disable_offloading(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
-    cache["weight"] = torch.ones(10)
+def _test_disable_offloading(offload_device, onload_device, offload_cache):
+    offload_cache["weight"] = torch.ones(10)
 
-    outside_onloaded = cache["weight"]
+    outside_onloaded = offload_cache["weight"]
     outside_onloaded_ref = ref(outside_onloaded)
     assert_device_equal(outside_onloaded.device, onload_device)
 
-    with cache.disable_offloading():
-        inside_onloaded = cache["weight"]
+    with offload_cache.disable_offloading():
+        inside_onloaded = offload_cache["weight"]
         inside_onloaded_ref = ref(inside_onloaded)
         assert_device_equal(inside_onloaded.device, onload_device)
 
@@ -70,26 +64,24 @@ def _test_disable_offloading(offload_device, onload_device):
     assert inside_onloaded_ref() is None
 
 
-def _test_disable_onloading(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
+def _test_disable_onloading(offload_device, onload_device, offload_cache):
     tensor = torch.ones(10)
-    cache.offloaded_values["weight"] = tensor
+    offload_cache.offloaded_values["weight"] = tensor
 
-    with cache.disable_onloading():
-        onloaded = cache["weight"]
+    with offload_cache.disable_onloading():
+        onloaded = offload_cache["weight"]
         assert onloaded is tensor
 
     assert onloaded is tensor
 
 
-def _test_delete(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
-    cache["weight"] = torch.ones(10)
-    onloaded = cache["weight"]
+def _test_delete(offload_device, onload_device, offload_cache):
+    offload_cache["weight"] = torch.ones(10)
+    onloaded = offload_cache["weight"]
     onloaded_ref = ref(onloaded)
 
-    with cache.disable_offloading():
-        del cache["weight"]
+    with offload_cache.disable_offloading():
+        del offload_cache["weight"]
         del onloaded
         gc.collect()
 
@@ -98,66 +90,69 @@ def _test_delete(offload_device, onload_device):
     assert onloaded_ref() is None
 
 
-def _test_shared_attributes(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
-    assert cache.offloading_disabled is cache.__class__.offloading_disabled
-    assert cache.onloading_disabled is cache.__class__.onloading_disabled
-    assert cache.keep_onloaded_values is cache.__class__.keep_onloaded_values
+def _test_shared_attributes(offload_device, onload_device, offload_cache):
+    assert (
+        offload_cache.offloading_disabled is offload_cache.__class__.offloading_disabled
+    )
+    assert (
+        offload_cache.onloading_disabled is offload_cache.__class__.onloading_disabled
+    )
+    assert (
+        offload_cache.keep_onloaded_values
+        is offload_cache.__class__.keep_onloaded_values
+    )
 
-    assert not hasattr(cache.__class__, "onload_device")
-    assert not hasattr(cache.__class__, "offloaded_values")
+    assert not hasattr(offload_cache.__class__, "onload_device")
+    assert not hasattr(offload_cache.__class__, "offloaded_values")
 
 
-def _test_tensor_subclass(offload_device, onload_device):
+def _test_tensor_subclass(offload_device, onload_device, offload_cache):
     tensor = torch.ones(10)
     param = torch.nn.Parameter(torch.ones(10), requires_grad=False)
     buffer = torch.nn.Buffer(torch.ones(10))
 
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
-    cache["tensor"] = tensor
-    cache["param"] = param
-    cache["buffer"] = buffer
+    offload_cache["tensor"] = tensor
+    offload_cache["param"] = param
+    offload_cache["buffer"] = buffer
 
-    assert_tensor_equal(cache["tensor"], tensor, onload_device)
-    assert_tensor_equal(cache["param"], param, onload_device)
-    assert_tensor_equal(cache["buffer"], buffer, onload_device)
+    assert_tensor_equal(offload_cache["tensor"], tensor, onload_device)
+    assert_tensor_equal(offload_cache["param"], param, onload_device)
+    assert_tensor_equal(offload_cache["buffer"], buffer, onload_device)
 
-    with cache.disable_onloading():
-        assert_tensor_equal(cache["tensor"], tensor, offload_device)
-        assert_tensor_equal(cache["param"], param, offload_device)
-        assert_tensor_equal(cache["buffer"], buffer, offload_device)
+    with offload_cache.disable_onloading():
+        assert_tensor_equal(offload_cache["tensor"], tensor, offload_device)
+        assert_tensor_equal(offload_cache["param"], param, offload_device)
+        assert_tensor_equal(offload_cache["buffer"], buffer, offload_device)
 
 
-def _test_update_offload(offload_device, onload_device):
-    cache = OffloadCache.cls_from_device(offload_device)(onload_device)
-
+def _test_update_offload(offload_device, onload_device, offload_cache):
     # Create initial tensor and offload it
     initial_data = torch.ones(10, device=onload_device)
-    cache["weight"] = initial_data
+    offload_cache["weight"] = initial_data
 
     # Verify initial value
-    onloaded = cache["weight"]
+    onloaded = offload_cache["weight"]
     assert_tensor_equal(onloaded, initial_data, onload_device)
 
     # Update with new data
     new_data = torch.ones(10, device=onload_device) * 2.0
-    cache["weight"] = new_data
+    offload_cache["weight"] = new_data
 
     # Verify update worked
-    updated_onloaded = cache["weight"]
+    updated_onloaded = offload_cache["weight"]
     assert_tensor_equal(updated_onloaded, new_data, onload_device)
 
     # Verify offloaded tensor was updated in place (not replaced)
-    with cache.disable_onloading():
-        offloaded = cache["weight"]
+    with offload_cache.disable_onloading():
+        offloaded = offload_cache["weight"]
         assert_tensor_equal(offloaded, new_data, offload_device)
 
     # Test update with disable_offloading context
-    with cache.disable_offloading():
-        cache["weight"] = torch.ones(10, device=onload_device) * 3.0
-        cached_onloaded = cache["weight"]
+    with offload_cache.disable_offloading():
+        offload_cache["weight"] = torch.ones(10, device=onload_device) * 3.0
+        cached_onloaded = offload_cache["weight"]
         assert_tensor_equal(cached_onloaded, torch.ones(10) * 3.0, onload_device)
 
     # Verify update persisted after context exit
-    final_onloaded = cache["weight"]
+    final_onloaded = offload_cache["weight"]
     assert_tensor_equal(final_onloaded, torch.ones(10) * 3.0, onload_device)
@@ -13,71 +13,70 @@
     _test_onloading,
     _test_shared_attributes,
     _test_tensor_subclass,
-    _test_update_offload,
 )
 from tests.testing_utils import requires_gpu
 
 
-ONLOAD_DEVICE = torch.device("cuda")
-OFFLOAD_DEVICE = torch.device("cpu")
+@pytest.fixture()
+def onload_device():
+    return torch.device("cuda")
 
 
-@pytest.mark.unit
-@requires_gpu
-def test_delete():
-    _test_delete(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+@pytest.fixture()
+def offload_device():
+    return torch.device("cpu")
 
 
 @pytest.mark.unit
 @requires_gpu
-def test_disable_offloading():
-    _test_disable_offloading(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+def test_delete(offload_device, onload_device, offload_cache):
+    _test_delete(offload_device, onload_device, offload_cache)
 
 
 @pytest.mark.unit
 @requires_gpu
-def test_disable_onloading():
-    _test_disable_onloading(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+def test_disable_offloading(offload_device, onload_device, offload_cache):
+    _test_disable_offloading(offload_device, onload_device, offload_cache)
 
 
 @pytest.mark.unit
 @requires_gpu
-def test_garbage_collect():
-    _test_garbage_collect(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+def test_disable_onloading(offload_device, onload_device, offload_cache):
+    _test_disable_onloading(offload_device, onload_device, offload_cache)
 
 
 @pytest.mark.unit
 @requires_gpu
-def test_offload():
-    _test_offload(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+def test_garbage_collect(offload_device, onload_device, offload_cache):
+    _test_garbage_collect(offload_device, onload_device, offload_cache)
 
 
 @pytest.mark.unit
 @requires_gpu
-@requires_gpu
-def test_onload():
-    _test_onload(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+def test_offload(offload_device, onload_device, offload_cache):
+    _test_offload(offload_device, onload_device, offload_cache)
 
 
 @pytest.mark.unit
 @requires_gpu
-def test_onloading():
-    _test_onloading(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+@requires_gpu
+def test_onload(offload_device, onload_device, offload_cache):
+    _test_onload(offload_device, onload_device, offload_cache)
 
 
 @pytest.mark.unit
 @requires_gpu
-def test_shared_attributes():
-    _test_shared_attributes(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+def test_onloading(offload_device, onload_device, offload_cache):
+    _test_onloading(offload_device, onload_device, offload_cache)
 
 
 @pytest.mark.unit
 @requires_gpu
-def test_tensor_subclass():
-    _test_tensor_subclass(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+def test_shared_attributes(offload_device, onload_device, offload_cache):
+    _test_shared_attributes(offload_device, onload_device, offload_cache)
 
 
 @pytest.mark.unit
 @requires_gpu
-def test_update_offload():
-    _test_update_offload(OFFLOAD_DEVICE, ONLOAD_DEVICE)
+def test_tensor_subclass(offload_device, onload_device, offload_cache):
+    _test_tensor_subclass(offload_device, onload_device, offload_cache)