
Commit 61619c1

Remove LiteLLM caching from LM (#8742)
* avoid attribute error in cache get
* Update cache.py
1 parent da482fd commit 61619c1
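
In user-facing terms, DSPy now relies solely on its own two-level cache and always sends requests to LiteLLM with caching disabled. A minimal sketch of the resulting behavior, assuming the constructor keywords shown in the diffs below; the model name and prompts are illustrative:

import dspy

# `cache_in_memory` is gone from dspy.LM; `cache=True` now routes requests
# through DSPy's own in-memory + on-disk cache, and LiteLLM's cache stays off.
lm = dspy.LM("openai/gpt-4o-mini", cache=True)  # illustrative model name

first = lm("What is the capital of France?")   # real provider call
second = lm("What is the capital of France?")  # served from DSPy's cache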

File tree (6 files changed: +10 additions, -206 deletions)

dspy/clients/__init__.py
dspy/clients/cache.py
dspy/clients/lm.py
tests/clients/test_litellm_cache.py
tests/clients/test_lm.py
tests/predict/test_predict.py

dspy/clients/__init__.py

Lines changed: 1 addition & 33 deletions
@@ -3,7 +3,6 @@
 from pathlib import Path
 
 import litellm
-from litellm.caching.caching import Cache as LitellmCache
 
 from dspy.clients.base_lm import BaseLM, inspect_history
 from dspy.clients.cache import Cache
@@ -15,23 +14,12 @@
 
 DISK_CACHE_DIR = os.environ.get("DSPY_CACHEDIR") or os.path.join(Path.home(), ".dspy_cache")
 DISK_CACHE_LIMIT = int(os.environ.get("DSPY_CACHE_LIMIT", 3e10))  # 30 GB default
-
-
-def _litellm_track_cache_hit_callback(kwargs, completion_response, start_time, end_time):
-    # Access the cache_hit information
-    completion_response.cache_hit = kwargs.get("cache_hit", False)
-
-
-litellm.success_callback = [_litellm_track_cache_hit_callback]
-
-
 def configure_cache(
     enable_disk_cache: bool | None = True,
     enable_memory_cache: bool | None = True,
     disk_cache_dir: str | None = DISK_CACHE_DIR,
     disk_size_limit_bytes: int | None = DISK_CACHE_LIMIT,
     memory_max_entries: int | None = 1000000,
-    enable_litellm_cache: bool = False,
 ):
     """Configure the cache for DSPy.
 
@@ -41,27 +29,7 @@ def configure_cache(
         disk_cache_dir: The directory to store the on-disk cache.
         disk_size_limit_bytes: The size limit of the on-disk cache.
         memory_max_entries: The maximum number of entries in the in-memory cache.
-        enable_litellm_cache: Whether to enable LiteLLM cache.
     """
-    if enable_disk_cache and enable_litellm_cache:
-        raise ValueError(
-            "Cannot enable both LiteLLM and DSPy on-disk cache, please set at most one of `enable_disk_cache` or "
-            "`enable_litellm_cache` to True."
-        )
-
-    if enable_litellm_cache:
-        try:
-            litellm.cache = LitellmCache(disk_cache_dir=DISK_CACHE_DIR, type="disk")
-
-            if litellm.cache.cache.disk_cache.size_limit != DISK_CACHE_LIMIT:
-                litellm.cache.cache.disk_cache.reset("size_limit", DISK_CACHE_LIMIT)
-        except Exception as e:
-            # It's possible that users don't have the write permissions to the cache directory.
-            # In that case, we'll just disable the cache.
-            logger.warning("Failed to initialize LiteLLM cache: %s", e)
-            litellm.cache = None
-    else:
-        litellm.cache = None
 
     import dspy
 
@@ -75,7 +43,7 @@ def configure_cache(
 
 
 litellm.telemetry = False
-litellm.cache = None  # By default we disable litellm cache and use DSPy on-disk cache.
+litellm.cache = None  # By default we disable LiteLLM cache and use DSPy on-disk cache.
 
 DSPY_CACHE = Cache(
     enable_disk_cache=True,
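
For reference, a minimal sketch of calling the slimmed-down configure_cache; the keyword names follow the new signature above, and the cache directory is an illustrative placeholder:

import dspy

# `enable_litellm_cache` no longer exists; only DSPy's own cache layers are configurable.
dspy.clients.configure_cache(
    enable_disk_cache=True,
    enable_memory_cache=True,
    disk_cache_dir="/tmp/dspy_disk_cache",  # illustrative path
)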

dspy/clients/cache.py

Lines changed: 1 addition & 0 deletions
@@ -118,6 +118,7 @@ def get(self, request: dict[str, Any], ignored_args_for_cache_key: list[str] | N
         if hasattr(response, "usage"):
             # Clear the usage data when cache is hit, because no LM call is made
             response.usage = {}
+        response.cache_hit = True
         return response
 
     def put(
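
This added line takes over the job of the LiteLLM success callback removed from dspy/clients/__init__.py: the cache itself now marks hits, so no attribute error can occur when the flag is read. A standalone sketch of the pattern using a hypothetical stand-in class, not the real dspy.clients.cache.Cache:

from types import SimpleNamespace

class SketchCache:
    """Illustrative stand-in for dspy.clients.cache.Cache (not the real class)."""

    def __init__(self):
        self._store = {}

    def put(self, key, response):
        self._store[key] = response

    def get(self, key):
        response = self._store.get(key)
        if response is None:
            return None
        if hasattr(response, "usage"):
            response.usage = {}    # no tokens were spent on a cache hit
        response.cache_hit = True  # mirrors the line added in this commit
        return response

cache = SketchCache()
cache.put("req-1", SimpleNamespace(usage={"total_tokens": 42}))
hit = cache.get("req-1")
assert hit.cache_hit and hit.usage == {}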

dspy/clients/lm.py

Lines changed: 5 additions & 22 deletions
@@ -33,7 +33,6 @@ def __init__(
         temperature: float = 0.0,
         max_tokens: int = 4000,
         cache: bool = True,
-        cache_in_memory: bool = True,
         callbacks: list[BaseCallback] | None = None,
         num_retries: int = 3,
         provider: Provider | None = None,
@@ -53,7 +52,6 @@ def __init__(
             max_tokens: The maximum number of tokens to generate per response.
             cache: Whether to cache the model responses for reuse to improve performance
                 and reduce costs.
-            cache_in_memory (deprecated): To enable additional caching with LRU in memory.
             callbacks: A list of callback functions to run before and after each request.
             num_retries: The number of times to retry a request if it fails transiently due to
                 network error, rate limiting, etc. Requests are retried with exponential
@@ -66,7 +64,6 @@ def __init__(
         self.model = model
         self.model_type = model_type
         self.cache = cache
-        self.cache_in_memory = cache_in_memory
         self.provider = provider or self.infer_provider()
         self.callbacks = callbacks or []
         self.history = []
@@ -91,33 +88,21 @@ def __init__(
         else:
             self.kwargs = dict(temperature=temperature, max_tokens=max_tokens, **kwargs)
 
-    def _get_cached_completion_fn(self, completion_fn, cache, enable_memory_cache):
+    def _get_cached_completion_fn(self, completion_fn, cache):
         ignored_args_for_cache_key = ["api_key", "api_base", "base_url"]
-        if cache and enable_memory_cache:
+        if cache:
             completion_fn = request_cache(
                 cache_arg_name="request",
                 ignored_args_for_cache_key=ignored_args_for_cache_key,
             )(completion_fn)
-        elif cache:
-            completion_fn = request_cache(
-                cache_arg_name="request",
-                ignored_args_for_cache_key=ignored_args_for_cache_key,
-                enable_memory_cache=False,
-            )(completion_fn)
-        else:
-            completion_fn = completion_fn
 
-        if not cache or litellm.cache is None:
-            litellm_cache_args = {"no-cache": True, "no-store": True}
-        else:
-            litellm_cache_args = {"no-cache": False, "no-store": False}
+        litellm_cache_args = {"no-cache": True, "no-store": True}
 
         return completion_fn, litellm_cache_args
 
     def forward(self, prompt=None, messages=None, **kwargs):
         # Build the request.
         cache = kwargs.pop("cache", self.cache)
-        enable_memory_cache = kwargs.pop("cache_in_memory", self.cache_in_memory)
 
         messages = messages or [{"role": "user", "content": prompt}]
         kwargs = {**self.kwargs, **kwargs}
@@ -128,7 +113,7 @@ def forward(self, prompt=None, messages=None, **kwargs):
             completion = litellm_text_completion
         elif self.model_type == "responses":
             completion = litellm_responses_completion
-        completion, litellm_cache_args = self._get_cached_completion_fn(completion, cache, enable_memory_cache)
+        completion, litellm_cache_args = self._get_cached_completion_fn(completion, cache)
 
         results = completion(
             request=dict(model=self.model, messages=messages, **kwargs),
@@ -145,7 +130,6 @@ def forward(self, prompt=None, messages=None, **kwargs):
     async def aforward(self, prompt=None, messages=None, **kwargs):
         # Build the request.
         cache = kwargs.pop("cache", self.cache)
-        enable_memory_cache = kwargs.pop("cache_in_memory", self.cache_in_memory)
 
         messages = messages or [{"role": "user", "content": prompt}]
         kwargs = {**self.kwargs, **kwargs}
@@ -156,7 +140,7 @@ async def aforward(self, prompt=None, messages=None, **kwargs):
             completion = alitellm_text_completion
         elif self.model_type == "responses":
             completion = alitellm_responses_completion
-        completion, litellm_cache_args = self._get_cached_completion_fn(completion, cache, enable_memory_cache)
+        completion, litellm_cache_args = self._get_cached_completion_fn(completion, cache)
 
         results = await completion(
             request=dict(model=self.model, messages=messages, **kwargs),
@@ -246,7 +230,6 @@ def dump_state(self):
             "model",
             "model_type",
            "cache",
-            "cache_in_memory",
            "num_retries",
            "finetuning_model",
            "launch_kwargs",

tests/clients/test_litellm_cache.py

Lines changed: 0 additions & 107 deletions
This file was deleted.

tests/clients/test_lm.py

Lines changed: 3 additions & 43 deletions
@@ -62,48 +62,13 @@ def test_chat_lms_can_be_queried(litellm_test_server):
     assert azure_openai_lm("azure openai query") == expected_response
 
 
-@pytest.mark.parametrize(
-    ("cache", "cache_in_memory"),
-    [
-        (True, True),
-        (True, False),
-        (False, True),
-        (False, False),
-    ],
-)
-def test_litellm_cache(litellm_test_server, cache, cache_in_memory):
-    api_base, _ = litellm_test_server
-    expected_response = ["Hi!"]
-
-    original_cache = dspy.cache
-    dspy.clients.configure_cache(
-        enable_disk_cache=False,
-        enable_memory_cache=cache_in_memory,
-        enable_litellm_cache=cache,
-    )
-
-    openai_lm = dspy.LM(
-        model="openai/dspy-test-model",
-        api_base=api_base,
-        api_key="fakekey",
-        model_type="chat",
-        cache=cache,
-        cache_in_memory=cache_in_memory,
-    )
-    assert openai_lm("openai query") == expected_response
-
-    # Reset the cache configuration
-    dspy.cache = original_cache
-
-
 def test_dspy_cache(litellm_test_server, tmp_path):
     api_base, _ = litellm_test_server
 
     original_cache = dspy.cache
     dspy.clients.configure_cache(
         enable_disk_cache=True,
         enable_memory_cache=True,
-        enable_litellm_cache=False,
         disk_cache_dir=tmp_path / ".disk_cache",
     )
     cache = dspy.cache
@@ -288,7 +253,6 @@ def test_dump_state():
         "max_tokens": 100,
         "num_retries": 10,
         "cache": True,
-        "cache_in_memory": True,
         "finetuning_model": None,
         "launch_kwargs": {"temperature": 1},
         "train_kwargs": {"temperature": 5},
@@ -377,7 +341,6 @@ async def test_async_lm_call_with_cache(tmp_path):
     dspy.clients.configure_cache(
         enable_disk_cache=True,
         enable_memory_cache=True,
-        enable_litellm_cache=False,
         disk_cache_dir=tmp_path / ".disk_cache",
     )
     cache = dspy.cache
@@ -400,11 +363,10 @@ async def test_async_lm_call_with_cache(tmp_path):
     # Second call should hit the cache, so no new call to LiteLLM is made.
     assert mock_alitellm_completion.call_count == 1
 
-    # Test that explicitly disabling memory cache works
-    await lm.acall("New query", cache_in_memory=False)
+    # A new query should result in a new LiteLLM call and a new cache entry.
+    await lm.acall("New query")
 
-    # There should be a new call to LiteLLM on new query, but the memory cache shouldn't be written to.
-    assert len(cache.memory_cache) == 1
+    assert len(cache.memory_cache) == 2
     assert mock_alitellm_completion.call_count == 2
 
     dspy.cache = original_cache
@@ -470,7 +432,6 @@ def test_responses_api(litellm_test_server):
         api_key="fakekey",
         model_type="responses",
         cache=False,
-        cache_in_memory=False,
     )
     assert lm("openai query") == [expected_text]
 
@@ -501,7 +462,6 @@ def test_responses_api_tool_calls(litellm_test_server):
         api_key="fakekey",
         model_type="responses",
         cache=False,
-        cache_in_memory=False,
     )
     assert lm("openai query") == expected_response
 
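
The rewritten assertions reflect that every cached call now also lands in the in-memory cache, so two distinct queries produce two entries. A minimal sketch of exercising acall under the new behavior, assuming a configured LM; the model name is illustrative:

import asyncio
import dspy

async def main():
    lm = dspy.LM("openai/dspy-test-model", cache=True)  # illustrative model name
    await lm.acall("First query")   # provider call, then stored in the cache
    await lm.acall("First query")   # cache hit, no second provider call
    await lm.acall("New query")     # new provider call and a second cache entry

asyncio.run(main())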

tests/predict/test_predict.py

Lines changed: 0 additions & 1 deletion
@@ -56,7 +56,6 @@ def test_lm_after_dump_and_load_state():
         "max_tokens": 100,
         "num_retries": 10,
         "cache": True,
-        "cache_in_memory": True,
         "finetuning_model": None,
         "launch_kwargs": {},
         "train_kwargs": {},
