From ada82d1f4505e21be2ca6fea190653e681f68583 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Fri, 3 Oct 2025 14:23:18 +0000 Subject: [PATCH 1/4] Add in-memory caching for lm-eval base model results Implements caching mechanism to reduce test times by avoiding redundant base model evaluations when multiple tests use the same configuration. Cache is stored in memory during test session and automatically cleared when the process exits. Signed-off-by: Rahul Tuli --- tests/lmeval/test_lmeval.py | 9 +- tests/testing_utils.py | 183 +++++++++++++++++++++++++++++++++++- 2 files changed, 189 insertions(+), 3 deletions(-) diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py index a44cd042f..e22fea149 100644 --- a/tests/lmeval/test_lmeval.py +++ b/tests/lmeval/test_lmeval.py @@ -15,7 +15,7 @@ from llmcompressor.core import active_session from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing from tests.test_timer.timer_utils import get_singleton_manager, log_time -from tests.testing_utils import requires_gpu +from tests.testing_utils import cached_lm_eval, get_lmeval_cache_stats, requires_gpu class LmEvalConfig(BaseModel): @@ -110,6 +110,10 @@ def set_up(self, test_data_file: str): if self.lmeval.metrics: logger.info("Absolute metrics provided - will show warnings if outside ±5%") + # Log cache stats + cache_stats = get_lmeval_cache_stats() + logger.info(f"LM-Eval cache stats: {cache_stats}") + self.num_calibration_samples = eval_config.get("num_calibration_samples", 512) self.max_seq_length = 2048 @@ -149,8 +153,9 @@ def test_lm_eval(self, test_data_file: str): self.tear_down() @log_time + @cached_lm_eval def _eval_base_model(self): - """Evaluate the base (uncompressed) model.""" + """Evaluate the base (uncompressed) model with caching.""" model_args = {**self.lmeval.model_args, "pretrained": self.model} results = lm_eval.simple_evaluate( diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 4ce6a5de6..e7f68fb61 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -1,20 +1,29 @@ import dataclasses import enum +import hashlib +import json import logging import os from dataclasses import dataclass from enum import Enum +from functools import wraps from pathlib import Path from subprocess import PIPE, STDOUT, run -from typing import Callable, List, Optional, Union +from typing import Any, Callable, List, Optional, Union import pytest import torch import yaml from datasets import Dataset +from loguru import logger from transformers import ProcessorMixin TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", None) +DISABLE_LMEVAL_CACHE = os.environ.get("DISABLE_LMEVAL_CACHE", "").lower() in ( + "1", + "true", + "yes", +) # TODO: maybe test type as decorators? @@ -300,3 +309,175 @@ def requires_cadence(cadence: Union[str, List[str]]) -> Callable: return pytest.mark.skipif( (current_cadence not in cadence), reason="cadence mismatch" ) + + +# ============================================================================= +# LM-Eval Base Model Caching +# ============================================================================= +# In-memory cache for base model lm-eval results within a single test session. +# This avoids redundant base model evaluations when multiple tests use the same +# base model configuration in a single pytest run. +# +# Usage: +# from tests.testing_utils import cached_lm_eval +# +# class TestLMEval: +# @cached_lm_eval +# def _eval_base_model(self): +# results = lm_eval.simple_evaluate(...) 
+# return results +# +# Example: +# Running 8 tests with the same base model in one pytest session: +# - Test 1: Evaluates base model (5 min) → cached +# - Tests 2-8: Use cached results (instant) → 35 min saved +# ============================================================================= + +# Module-level cache - persists for duration of Python process +_LMEVAL_CACHE: dict = {} + + +@dataclass(frozen=True) +class LMEvalCacheKey: + """Unique identifier for a base model evaluation. + + This is used as a dict key, so it must be hashable (immutable). + frozen=True makes the dataclass immutable and hashable. + + :param model: HuggingFace model identifier + :param task: LM-Eval task name + :param num_fewshot: Number of few-shot examples + :param limit: Maximum number of samples to evaluate + :param batch_size: Batch size for evaluation + :param model_args_hash: SHA256 hash of model_args dict + """ + + model: str + task: str + num_fewshot: int + limit: int + batch_size: int + model_args_hash: str # Hash of dict (dicts aren't hashable) + + @classmethod + def from_test_instance(cls, test_instance: Any) -> "LMEvalCacheKey": + """Extract cache key from a TestLMEval instance. + + :param test_instance: Instance with model, lmeval attributes + :return: LMEvalCacheKey identifying this evaluation configuration + :raises AttributeError: If required attributes are missing + """ + # Hash model_args to make it hashable + model_args = test_instance.lmeval.model_args + args_str = json.dumps(model_args, sort_keys=True) + args_hash = hashlib.sha256(args_str.encode()).hexdigest() + + return cls( + model=test_instance.model, + task=test_instance.lmeval.task, + num_fewshot=test_instance.lmeval.num_fewshot, + limit=test_instance.lmeval.limit, + batch_size=test_instance.lmeval.batch_size, + model_args_hash=args_hash, + ) + + def __str__(self) -> str: + """Human-readable representation for logging.""" + return ( + f"{self.model}|{self.task}|" + f"fs={self.num_fewshot}|lim={self.limit}|" + f"bs={self.batch_size}|args={self.model_args_hash[:8]}" + ) + + +def get_lmeval_cache_stats() -> dict: + """Get current LM-Eval cache statistics. + + :return: Cache metrics including total_entries (int) and keys (list of str) + """ + return { + "total_entries": len(_LMEVAL_CACHE), + "keys": [str(key) for key in _LMEVAL_CACHE.keys()], + } + + +def clear_lmeval_cache() -> None: + """Clear all cached LM-Eval entries.""" + global _LMEVAL_CACHE + _LMEVAL_CACHE.clear() + logger.info("LM-Eval in-memory cache cleared") + + +def cached_lm_eval(func: Callable) -> Callable: + """Decorator to cache lm-eval results for base model evaluations. + + This decorator uses a module-level dict to cache results within the + current Python process. The cache is automatically cleared when the + process exits. + + Workflow: + 1. Extract cache key from test instance (self) + 2. Check if key exists in cache + 3. If HIT: return cached results (instant) + 4. If MISS: call original function, cache results + 5. Handle errors gracefully (fail-safe) + + The decorator can be disabled via environment variable:: + + DISABLE_LMEVAL_CACHE=1 + + Example:: + + @cached_lm_eval + def _eval_base_model(self): + return lm_eval.simple_evaluate(...) 
+ + :param func: Method to decorate (must be instance method with self) + :return: Wrapped function with caching logic + """ + + @wraps(func) + def wrapper(self, *args, **kwargs): + # Check if caching is disabled + if DISABLE_LMEVAL_CACHE: + logger.info( + "LM-Eval cache disabled via DISABLE_LMEVAL_CACHE environment variable" + ) + return func(self, *args, **kwargs) + + # Extract cache key from test instance + try: + cache_key = LMEvalCacheKey.from_test_instance(self) + except (AttributeError, TypeError) as e: + # If we can't extract key, run without cache (fail-safe) + logger.warning( + f"Could not extract LM-Eval cache key: {e.__class__.__name__}: {e}. " + f"Running without cache." + ) + return func(self, *args, **kwargs) + + # Check cache + if cache_key in _LMEVAL_CACHE: + logger.info(f"LM-Eval cache HIT: {cache_key}") + logger.info("✓ Using cached base model results") + return _LMEVAL_CACHE[cache_key] + + # Cache miss - evaluate base model + logger.info(f"LM-Eval cache MISS: {cache_key}") + logger.info("Evaluating base model (will be cached for subsequent tests)") + + results = func(self, *args, **kwargs) + + # Store in cache + try: + _LMEVAL_CACHE[cache_key] = results + logger.info( + f"LM-Eval cache WRITE: {cache_key} (total entries: {len(_LMEVAL_CACHE)})" + ) + except Exception as e: + # Cache write should never fail tests + logger.error(f"Failed to cache LM-Eval results: {e.__class__.__name__}: {e}") + + return results + + return wrapper From dc879179b64bd0ca3ad2d3912e1a0707c0e0b2ba Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Mon, 6 Oct 2025 11:57:56 +0000 Subject: [PATCH 2/4] Simplify logic --- tests/lmeval/test_lmeval.py | 8 +- tests/testing_utils.py | 214 +++++++++--------------------------- 2 files changed, 57 insertions(+), 165 deletions(-) diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py index e22fea149..deee4117f 100644 --- a/tests/lmeval/test_lmeval.py +++ b/tests/lmeval/test_lmeval.py @@ -15,7 +15,7 @@ from llmcompressor.core import active_session from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing from tests.test_timer.timer_utils import get_singleton_manager, log_time -from tests.testing_utils import cached_lm_eval, get_lmeval_cache_stats, requires_gpu +from tests.testing_utils import cached_lm_eval_run, requires_gpu class LmEvalConfig(BaseModel): @@ -110,10 +110,6 @@ def set_up(self, test_data_file: str): if self.lmeval.metrics: logger.info("Absolute metrics provided - will show warnings if outside ±5%") - # Log cache stats - cache_stats = get_lmeval_cache_stats() - logger.info(f"LM-Eval cache stats: {cache_stats}") - self.num_calibration_samples = eval_config.get("num_calibration_samples", 512) self.max_seq_length = 2048 @@ -153,7 +149,7 @@ def test_lm_eval(self, test_data_file: str): self.tear_down() @log_time - @cached_lm_eval + @cached_lm_eval_run def _eval_base_model(self): """Evaluate the base (uncompressed) model with caching.""" model_args = {**self.lmeval.model_args, "pretrained": self.model} diff --git a/tests/testing_utils.py b/tests/testing_utils.py index e7f68fb61..cb63b4384 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -9,7 +9,7 @@ from functools import wraps from pathlib import Path from subprocess import PIPE, STDOUT, run -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, List, NamedTuple, Optional, Union import pytest import torch @@ -25,6 +25,28 @@ "yes", ) +# Module-level cache - persists for duration of Python process +_LMEVAL_CACHE: dict = {} + + +class 
LMEvalCacheKey(NamedTuple): + """Hashable cache key for a base model evaluation. + + :param model: HuggingFace model identifier + :param task: LM-Eval task name + :param num_fewshot: Number of few-shot examples + :param limit: Maximum number of samples to evaluate + :param batch_size: Batch size for evaluation + :param model_args_hash: SHA256 hash of model_args dict + """ + + model: str + task: str + num_fewshot: int + limit: int + batch_size: int + model_args_hash: str + # TODO: maybe test type as decorators? class TestType(Enum): @@ -311,173 +333,47 @@ def requires_cadence(cadence: Union[str, List[str]]) -> Callable: ) -# ============================================================================= -# LM-Eval Base Model Caching -# ============================================================================= -# In-memory cache for base model lm-eval results within a single test session. -# This avoids redundant base model evaluations when multiple tests use the same -# base model configuration in a single pytest run. -# -# Usage: -# from tests.testing_utils import cached_lm_eval -# -# class TestLMEval: -# @cached_lm_eval -# def _eval_base_model(self): -# results = lm_eval.simple_evaluate(...) -# return results -# -# Example: -# Running 8 tests with the same base model in one pytest session: -# - Test 1: Evaluates base model (5 min) → cached -# - Tests 2-8: Use cached results (instant) → 35 min saved -# ============================================================================= - -# Module-level cache - persists for duration of Python process -_LMEVAL_CACHE: dict = {} - - -@dataclass(frozen=True) -class LMEvalCacheKey: - """Unique identifier for a base model evaluation. - - This is used as a dict key, so it must be hashable (immutable). - frozen=True makes the dataclass immutable and hashable. - - :param model: HuggingFace model identifier - :param task: LM-Eval task name - :param num_fewshot: Number of few-shot examples - :param limit: Maximum number of samples to evaluate - :param batch_size: Batch size for evaluation - :param model_args_hash: SHA256 hash of model_args dict - """ +def _make_lmeval_cache_key(test_instance: Any) -> LMEvalCacheKey: + """Create a hashable cache key from a TestLMEval instance. - model: str - task: str - num_fewshot: int - limit: int - batch_size: int - model_args_hash: str # Hash of dict (dicts aren't hashable) - - @classmethod - def from_test_instance(cls, test_instance: Any) -> "LMEvalCacheKey": - """Extract cache key from a TestLMEval instance. - - :param test_instance: Instance with model, lmeval attributes - :return: LMEvalCacheKey identifying this evaluation configuration - :raises AttributeError: If required attributes are missing - """ - # Hash model_args to make it hashable - model_args = test_instance.lmeval.model_args - args_str = json.dumps(model_args, sort_keys=True) - args_hash = hashlib.sha256(args_str.encode()).hexdigest() - - return cls( - model=test_instance.model, - task=test_instance.lmeval.task, - num_fewshot=test_instance.lmeval.num_fewshot, - limit=test_instance.lmeval.limit, - batch_size=test_instance.lmeval.batch_size, - model_args_hash=args_hash, - ) - - def __str__(self) -> str: - """Human-readable representation for logging.""" - return ( - f"{self.model}|{self.task}|" - f"fs={self.num_fewshot}|lim={self.limit}|" - f"bs={self.batch_size}|args={self.model_args_hash[:8]}" - ) - - -def get_lmeval_cache_stats() -> dict: - """Get current LM-Eval cache statistics. 
- - :return: Cache metrics including total_entries (int) and keys (list of str) + :param test_instance: Instance with model, lmeval attributes + :return: LMEvalCacheKey for this evaluation configuration + :raises AttributeError: If required attributes are missing """ - return { - "total_entries": len(_LMEVAL_CACHE), - "keys": [str(key) for key in _LMEVAL_CACHE.keys()], - } - - -def clear_lmeval_cache() -> None: - """Clear all cached LM-Eval entries.""" - global _LMEVAL_CACHE - _LMEVAL_CACHE.clear() - logger.info("LM-Eval in-memory cache cleared") - - -def cached_lm_eval(func: Callable) -> Callable: - """Decorator to cache lm-eval results for base model evaluations. - - This decorator uses a module-level dict to cache results within the - current Python process. The cache is automatically cleared when the - process exits. - - Workflow: - 1. Extract cache key from test instance (self) - 2. Check if key exists in cache - 3. If HIT: return cached results (instant) - 4. If MISS: call original function, cache results - 5. Handle errors gracefully (fail-safe) - - The decorator can be disabled via environment variable:: - - DISABLE_LMEVAL_CACHE=1 - - Example:: + model_args = test_instance.lmeval.model_args + args_str = json.dumps(model_args, sort_keys=True) + args_hash = hashlib.sha256(args_str.encode()).hexdigest() + + return LMEvalCacheKey( + model=test_instance.model, + task=test_instance.lmeval.task, + num_fewshot=test_instance.lmeval.num_fewshot, + limit=test_instance.lmeval.limit, + batch_size=test_instance.lmeval.batch_size, + model_args_hash=args_hash, + ) - @cached_lm_eval - def _eval_base_model(self): - return lm_eval.simple_evaluate(...) - :param func: Method to decorate (must be instance method with self) - :return: Wrapped function with caching logic - """ +def cached_lm_eval_run(func: Callable) -> Callable: + """Decorator that caches LM-Eval results for instance methods, with optional disabling.""" @wraps(func) - def wrapper(self, *args, **kwargs): - # Check if caching is disabled + def cached(self, *args, **kwargs): if DISABLE_LMEVAL_CACHE: - logger.info( - "LM-Eval cache disabled via DISABLE_LMEVAL_CACHE environment variable" - ) - return func(self, *args, **kwargs) - - # Extract cache key from test instance - try: - cache_key = LMEvalCacheKey.from_test_instance(self) - except (AttributeError, TypeError) as e: - # If we can't extract key, run without cache (fail-safe) - logger.warning( - f"Could not extract LM-Eval cache key: {e.__class__.__name__}: {e}. " - f"Running without cache." 
- ) + logger.info("LM-Eval cache disabled via DISABLE_LMEVAL_CACHE") return func(self, *args, **kwargs) - # Check cache - if cache_key in _LMEVAL_CACHE: - logger.info(f"LM-Eval cache HIT: {cache_key}") - logger.info("✓ Using cached base model results") - return _LMEVAL_CACHE[cache_key] + key = _make_lmeval_cache_key(self) + cached_result = _LMEVAL_CACHE.get(key) + if cached_result is not None: + logger.info(f"LM-Eval cache HIT: {key}") + return cached_result - # Cache miss - evaluate base model - logger.info(f"LM-Eval cache MISS: {cache_key}") - logger.info("Evaluating base model (will be cached for subsequent tests)") - - results = func(self, *args, **kwargs) - - # Store in cache - try: - _LMEVAL_CACHE[cache_key] = results - logger.info( - f"LM-Eval cache WRITE: {cache_key} (total entries: {len(_LMEVAL_CACHE)})" - ) - except Exception as e: - # Cache write should never fail tests - logger.error(f"Failed to cache LM-Eval results: {e.__class__.__name__}: {e}") + logger.info(f"LM-Eval cache MISS: {key}") + result = func(self, *args, **kwargs) + _LMEVAL_CACHE[key] = result + logger.info(f"LM-Eval cache WRITE: {key} ({len(_LMEVAL_CACHE)} entries)") + return result - return results + return cached - return wrapper From b57c9ff680db2b40423352b5d1a3cb63665b5eeb Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Mon, 6 Oct 2025 12:38:29 +0000 Subject: [PATCH 3/4] Add: Persisting Cache Signed-off-by: Rahul Tuli --- tests/testing_utils.py | 100 ++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/tests/testing_utils.py b/tests/testing_utils.py index cb63b4384..34702d39d 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -4,12 +4,13 @@ import json import logging import os +from contextlib import suppress from dataclasses import dataclass from enum import Enum from functools import wraps from pathlib import Path from subprocess import PIPE, STDOUT, run -from typing import Any, Callable, List, NamedTuple, Optional, Union +from typing import Any, Callable, List, Optional, Union import pytest import torch @@ -24,22 +25,16 @@ "true", "yes", ) +LMEVAL_CACHE_DIR = Path(os.environ.get("LMEVAL_CACHE_DIR", ".lmeval_cache")) -# Module-level cache - persists for duration of Python process -_LMEVAL_CACHE: dict = {} +def _sha256_hash(text: str, length: Optional[int] = None) -> str: + hash_result = hashlib.sha256(text.encode()).hexdigest() + return hash_result[:length] if length else hash_result -class LMEvalCacheKey(NamedTuple): - """Hashable cache key for a base model evaluation. 
- - :param model: HuggingFace model identifier - :param task: LM-Eval task name - :param num_fewshot: Number of few-shot examples - :param limit: Maximum number of samples to evaluate - :param batch_size: Batch size for evaluation - :param model_args_hash: SHA256 hash of model_args dict - """ +@dataclass(frozen=True) +class LMEvalCacheKey: model: str task: str num_fewshot: int @@ -47,6 +42,42 @@ class LMEvalCacheKey(NamedTuple): batch_size: int model_args_hash: str + @classmethod + def from_test_instance(cls, test_instance: Any) -> "LMEvalCacheKey": + lmeval = test_instance.lmeval + model_args_json = json.dumps(lmeval.model_args, sort_keys=True) + + return cls( + model=test_instance.model, + task=lmeval.task, + num_fewshot=lmeval.num_fewshot, + limit=lmeval.limit, + batch_size=lmeval.batch_size, + model_args_hash=_sha256_hash(model_args_json), + ) + + @property + def cache_filepath(self) -> Path: + key_components = ( + f"{self.model}_{self.task}_{self.num_fewshot}_" + f"{self.limit}_{self.batch_size}_{self.model_args_hash}" + ) + return LMEVAL_CACHE_DIR / f"{_sha256_hash(key_components, 16)}.json" + + def get_cached_result(self) -> Optional[dict]: + if not self.cache_filepath.exists(): + return + + with suppress(Exception): + with open(self.cache_filepath) as f: + return json.load(f) + + def store_result(self, result: dict) -> None: + with suppress(Exception): + self.cache_filepath.parent.mkdir(parents=True, exist_ok=True) + with open(self.cache_filepath, "w") as f: + json.dump(result, f) + # TODO: maybe test type as decorators? class TestType(Enum): @@ -333,47 +364,22 @@ def requires_cadence(cadence: Union[str, List[str]]) -> Callable: ) -def _make_lmeval_cache_key(test_instance: Any) -> LMEvalCacheKey: - """Create a hashable cache key from a TestLMEval instance. 
- - :param test_instance: Instance with model, lmeval attributes - :return: LMEvalCacheKey for this evaluation configuration - :raises AttributeError: If required attributes are missing - """ - model_args = test_instance.lmeval.model_args - args_str = json.dumps(model_args, sort_keys=True) - args_hash = hashlib.sha256(args_str.encode()).hexdigest() - - return LMEvalCacheKey( - model=test_instance.model, - task=test_instance.lmeval.task, - num_fewshot=test_instance.lmeval.num_fewshot, - limit=test_instance.lmeval.limit, - batch_size=test_instance.lmeval.batch_size, - model_args_hash=args_hash, - ) - - - def cached_lm_eval_run(func: Callable) -> Callable: - """Decorator that caches LM-Eval results for instance methods, with optional disabling.""" @wraps(func) - def cached(self, *args, **kwargs): + def wrapper(self, *args, **kwargs): if DISABLE_LMEVAL_CACHE: - logger.info("LM-Eval cache disabled via DISABLE_LMEVAL_CACHE") + logger.info("LM-Eval cache disabled") return func(self, *args, **kwargs) - key = _make_lmeval_cache_key(self) - cached_result = _LMEVAL_CACHE.get(key) - if cached_result is not None: - logger.info(f"LM-Eval cache HIT: {key}") + cache_key = LMEvalCacheKey.from_test_instance(self) + + if (cached_result := cache_key.get_cached_result()) is not None: + logger.info(f"LM-Eval cache HIT: {cache_key}") return cached_result - logger.info(f"LM-Eval cache MISS: {key}") + logger.info(f"LM-Eval cache MISS: {cache_key}") result = func(self, *args, **kwargs) - _LMEVAL_CACHE[key] = result - logger.info(f"LM-Eval cache WRITE: {key} ({len(_LMEVAL_CACHE)} entries)") + cache_key.store_result(result) return result - return cached - + return wrapper From 6832230f5a0e37b6e5185902fb6b3b97a90f381a Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Mon, 6 Oct 2025 16:08:01 +0000 Subject: [PATCH 4/4] Improve: Logging w.r.t the cache Signed-off-by: Rahul Tuli --- tests/testing_utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 34702d39d..001ff96c2 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -4,7 +4,6 @@ import json import logging import os -from contextlib import suppress from dataclasses import dataclass from enum import Enum from functools import wraps @@ -68,15 +67,21 @@ def get_cached_result(self) -> Optional[dict]: if not self.cache_filepath.exists(): return - with suppress(Exception): + try: with open(self.cache_filepath) as f: return json.load(f) + except Exception as e: + logger.warning(f"Failed to load cache from {self.cache_filepath}: {e}") + return def store_result(self, result: dict) -> None: - with suppress(Exception): + try: self.cache_filepath.parent.mkdir(parents=True, exist_ok=True) with open(self.cache_filepath, "w") as f: - json.dump(result, f) + json.dump(result, f, default=str) + logger.info(f"LM-Eval cache WRITE: {self.cache_filepath}") + except Exception as e: + logger.warning(f"Failed to save cache to {self.cache_filepath}: {e}") # TODO: maybe test type as decorators? 
@@ -374,10 +379,10 @@ def wrapper(self, *args, **kwargs): cache_key = LMEvalCacheKey.from_test_instance(self) if (cached_result := cache_key.get_cached_result()) is not None: - logger.info(f"LM-Eval cache HIT: {cache_key}") + logger.info(f"LM-Eval cache HIT: {cache_key.cache_filepath}") return cached_result - logger.info(f"LM-Eval cache MISS: {cache_key}") + logger.info(f"LM-Eval cache MISS: {cache_key.cache_filepath}") result = func(self, *args, **kwargs) cache_key.store_result(result) return result
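
A minimal, self-contained sketch of the persistent cache flow this series converges on (patches 3-4): the key fields, the SHA256-derived file name under .lmeval_cache/, and the HIT / MISS / WRITE behavior. The model id, task settings, and result values below are hypothetical placeholders and do not come from the test configs; only the hashing scheme and file layout are intended to mirror tests/testing_utils.py after the final patch.

# Standalone sketch of the on-disk LM-Eval cache (assumptions noted above).
import hashlib
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

LMEVAL_CACHE_DIR = Path(".lmeval_cache")  # default for the LMEVAL_CACHE_DIR env var


def _sha256_hash(text: str, length: Optional[int] = None) -> str:
    digest = hashlib.sha256(text.encode()).hexdigest()
    return digest[:length] if length else digest


@dataclass(frozen=True)
class CacheKey:
    model: str
    task: str
    num_fewshot: int
    limit: int
    batch_size: int
    model_args_hash: str

    @property
    def cache_filepath(self) -> Path:
        # Same layout as the patched LMEvalCacheKey: join the key fields,
        # hash them, and truncate to 16 hex chars for the file name.
        components = (
            f"{self.model}_{self.task}_{self.num_fewshot}_"
            f"{self.limit}_{self.batch_size}_{self.model_args_hash}"
        )
        return LMEVAL_CACHE_DIR / f"{_sha256_hash(components, 16)}.json"


# Hypothetical configuration: the first run misses and writes the file,
# later runs with an identical configuration hit it.
model_args = {"dtype": "bfloat16", "add_bos_token": True}
key = CacheKey(
    model="some-org/some-base-model",  # placeholder model id
    task="gsm8k",
    num_fewshot=5,
    limit=250,
    batch_size=100,
    model_args_hash=_sha256_hash(json.dumps(model_args, sort_keys=True)),
)

if key.cache_filepath.exists():
    results = json.loads(key.cache_filepath.read_text())  # cache HIT
else:
    results = {"results": {"gsm8k": {"exact_match": 0.42}}}  # stand-in eval output
    key.cache_filepath.parent.mkdir(parents=True, exist_ok=True)
    key.cache_filepath.write_text(json.dumps(results, default=str))  # cache WRITE

print(key.cache_filepath, results)

Setting DISABLE_LMEVAL_CACHE=1 in the environment bypasses this path entirely and re-runs the base model evaluation, matching the decorator's early return in the patches.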