Commit 384a1a6

Merge branch 'main' into z-image_metadata_node

2 parents: a05a626 + 0021404

7 files changed: +247 −5

invokeai/app/services/config/config_default.py

Lines changed: 3 additions & 1 deletion
@@ -85,6 +85,7 @@ class InvokeAIAppConfig(BaseSettings):
         max_cache_ram_gb: The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.
         max_cache_vram_gb: The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.
         log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
+        model_cache_keep_alive_min: How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.
         device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.
         enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.
         keep_ram_copy_of_weights: Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.

@@ -165,9 +166,10 @@ class InvokeAIAppConfig(BaseSettings):
     max_cache_ram_gb: Optional[float] = Field(default=None, gt=0, description="The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.")
     max_cache_vram_gb: Optional[float] = Field(default=None, ge=0, description="The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.")
     log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
+    model_cache_keep_alive_min: float = Field(default=0, ge=0, description="How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.")
     device_working_mem_gb: float = Field(default=3, description="The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.")
     enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.")
-    keep_ram_copy_of_weights: bool = Field(default=True, description="Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.")
+    keep_ram_copy_of_weights: bool = Field(default=True, description="Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.")
     # Deprecated CACHE configs
     ram: Optional[float] = Field(default=None, gt=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.")
     vram: Optional[float] = Field(default=None, ge=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.")
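For reference, the new setting validates like any other bounded pydantic Field. Below is a minimal sketch of the validation semantics only, using a plain BaseModel and a hypothetical CacheSettingsSketch class rather than the app's InvokeAIAppConfig:

    from pydantic import BaseModel, ConfigDict, Field, ValidationError

    class CacheSettingsSketch(BaseModel):
        # Allow a field named "model_*" without pydantic's protected-namespace warning.
        model_config = ConfigDict(protected_namespaces=())

        # Mirrors the new field: default 0 disables the timeout, ge=0 rejects negatives.
        model_cache_keep_alive_min: float = Field(default=0, ge=0)

    print(CacheSettingsSketch().model_cache_keep_alive_min)  # 0.0 -> keep models indefinitely
    print(CacheSettingsSketch(model_cache_keep_alive_min=30).model_cache_keep_alive_min)  # 30.0

    try:
        CacheSettingsSketch(model_cache_keep_alive_min=-5)
    except ValidationError:
        print("negative values are rejected by the ge=0 constraint")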

invokeai/app/services/model_manager/model_manager_default.py

Lines changed: 7 additions & 0 deletions
@@ -60,6 +60,10 @@ def start(self, invoker: Invoker) -> None:
             service.start(invoker)

     def stop(self, invoker: Invoker) -> None:
+        # Shutdown the model cache to cancel any pending timers
+        if hasattr(self._load, "ram_cache"):
+            self._load.ram_cache.shutdown()
+
         for service in [self._store, self._install, self._load]:
             if hasattr(service, "stop"):
                 service.stop(invoker)

@@ -88,7 +92,10 @@ def build_model_manager(
             max_ram_cache_size_gb=app_config.max_cache_ram_gb,
             max_vram_cache_size_gb=app_config.max_cache_vram_gb,
             execution_device=execution_device or TorchDevice.choose_torch_device(),
+            storage_device="cpu",
+            log_memory_usage=app_config.log_memory_usage,
             logger=logger,
+            keep_alive_minutes=app_config.model_cache_keep_alive_min,
         )
         loader = ModelLoadService(
             app_config=app_config,
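The stop() hook above probes for the cache with hasattr() rather than assuming every loader exposes one. A minimal sketch of this duck-typed shutdown pattern, with hypothetical FakeCache/FakeLoader stand-ins that are not from the InvokeAI codebase:

    class FakeCache:
        def shutdown(self) -> None:
            print("cache: pending timers cancelled")

    class FakeLoader:
        ram_cache = FakeCache()

    def stop(load_service: object) -> None:
        # Only cache-backed loaders expose a ram_cache; guard before calling.
        if hasattr(load_service, "ram_cache"):
            load_service.ram_cache.shutdown()

    stop(FakeLoader())  # cache: pending timers cancelled
    stop(object())      # no-op: no ram_cache attribute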

invokeai/backend/model_manager/load/load_default.py

Lines changed: 0 additions & 1 deletion
@@ -75,7 +75,6 @@ def _load_and_cache(self, config: AnyModelConfig, submodel_type: Optional[SubMod

         config.path = str(self._get_model_path(config))
         self._ram_cache.make_room(self.get_size_fs(config, Path(config.path), submodel_type))
-        self._logger.info(f"Loading model '{stats_name}' into RAM cache..., config={config}")
         loaded_model = self._load_model(config, submodel_type)

         self._ram_cache.put(

invokeai/backend/model_manager/load/model_cache/model_cache.py

Lines changed: 104 additions & 2 deletions
@@ -55,6 +55,21 @@ def wrapper(self, *args, **kwargs):
     return wrapper


+def record_activity(method: Callable[..., Any]) -> Callable[..., Any]:
+    """A decorator that records activity after a method completes successfully.
+
+    Note: This decorator should be applied to methods that already hold self._lock.
+    """
+
+    @wraps(method)
+    def wrapper(self, *args, **kwargs):
+        result = method(self, *args, **kwargs)
+        self._record_activity()
+        return result
+
+    return wrapper
+
+
 @dataclass
 class CacheEntrySnapshot:
     cache_key: str
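The new record_activity decorator only touches bookkeeping after the wrapped method returns without raising. A self-contained demo of the same pattern, with a hypothetical Bump class standing in for the cache:

    from functools import wraps
    from typing import Any, Callable

    def record_activity(method: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(method)
        def wrapper(self, *args, **kwargs):
            result = method(self, *args, **kwargs)
            self._record_activity()  # runs only on success; an exception skips it
            return result
        return wrapper

    class Bump:
        def __init__(self) -> None:
            self.activity_count = 0

        def _record_activity(self) -> None:
            self.activity_count += 1

        @record_activity
        def get(self, key: str) -> str:
            return f"value-for-{key}"

    b = Bump()
    b.get("model-a")
    print(b.activity_count)  # 1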
@@ -132,6 +147,7 @@ def __init__(
         storage_device: torch.device | str = "cpu",
         log_memory_usage: bool = False,
         logger: Optional[Logger] = None,
+        keep_alive_minutes: float = 0,
     ):
         """Initialize the model RAM cache.

@@ -151,6 +167,7 @@ def __init__(
         snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's
         behaviour.
         :param logger: InvokeAILogger to use (otherwise creates one)
+        :param keep_alive_minutes: How long to keep models in cache after last use (in minutes). 0 means keep indefinitely.
         """
         self._enable_partial_loading = enable_partial_loading
         self._keep_ram_copy_of_weights = keep_ram_copy_of_weights

@@ -182,6 +199,12 @@ def __init__(
         self._on_cache_miss_callbacks: set[CacheMissCallback] = set()
         self._on_cache_models_cleared_callbacks: set[CacheModelsClearedCallback] = set()

+        # Keep-alive timeout support
+        self._keep_alive_minutes = keep_alive_minutes
+        self._last_activity_time: Optional[float] = None
+        self._timeout_timer: Optional[threading.Timer] = None
+        self._shutdown_event = threading.Event()
+
     def on_cache_hit(self, cb: CacheHitCallback) -> Callable[[], None]:
         self._on_cache_hit_callbacks.add(cb)

@@ -190,7 +213,7 @@ def unsubscribe() -> None:

         return unsubscribe

-    def on_cache_miss(self, cb: CacheHitCallback) -> Callable[[], None]:
+    def on_cache_miss(self, cb: CacheMissCallback) -> Callable[[], None]:
         self._on_cache_miss_callbacks.add(cb)

         def unsubscribe() -> None:
@@ -218,7 +241,78 @@ def stats(self, stats: CacheStats) -> None:
         """Set the CacheStats object for collecting cache statistics."""
         self._stats = stats

+    def _record_activity(self) -> None:
+        """Record model activity and reset the timeout timer if configured.
+
+        Note: This method should only be called when self._lock is already held.
+        """
+        if self._keep_alive_minutes <= 0:
+            return
+
+        self._last_activity_time = time.time()
+
+        # Cancel any existing timer
+        if self._timeout_timer is not None:
+            self._timeout_timer.cancel()
+
+        # Start a new timer
+        timeout_seconds = self._keep_alive_minutes * 60
+        self._timeout_timer = threading.Timer(timeout_seconds, self._on_timeout)
+        # Set as daemon so it doesn't prevent application shutdown
+        self._timeout_timer.daemon = True
+        self._timeout_timer.start()
+        self._logger.debug(f"Model cache activity recorded. Timeout set to {self._keep_alive_minutes} minutes.")
+
     @synchronized
+    @record_activity
+    def _on_timeout(self) -> None:
+        """Called when the keep-alive timeout expires. Clears the model cache."""
+        if self._shutdown_event.is_set():
+            return
+
+        # Double-check if there has been activity since the timer was set
+        # This handles the race condition where activity occurred just before the timer fired
+        if self._last_activity_time is not None and self._keep_alive_minutes > 0:
+            elapsed_minutes = (time.time() - self._last_activity_time) / 60
+            if elapsed_minutes < self._keep_alive_minutes:
+                # Activity occurred, don't clear cache
+                self._logger.debug(
+                    f"Model cache timeout fired but activity detected {elapsed_minutes:.2f} minutes ago. "
+                    f"Skipping cache clear."
+                )
+                return
+
+        # Check if there are any unlocked models that can be cleared
+        unlocked_models = [key for key, entry in self._cached_models.items() if not entry.is_locked]
+
+        if len(unlocked_models) > 0:
+            self._logger.info(
+                f"Model cache keep-alive timeout of {self._keep_alive_minutes} minutes expired. "
+                f"Clearing {len(unlocked_models)} unlocked model(s) from cache."
+            )
+            # Clear the cache by requesting a very large amount of space.
+            # This is the same logic used by the "Clear Model Cache" button.
+            # Using 1000 GB ensures all unlocked models are removed.
+            self._make_room_internal(1000 * GB)
+        elif len(self._cached_models) > 0:
+            # All models are locked, don't log at info level
+            self._logger.debug(
+                f"Model cache timeout fired but all {len(self._cached_models)} model(s) are locked. "
+                f"Skipping cache clear."
+            )
+        else:
+            self._logger.debug("Model cache timeout fired but cache is already empty.")
+
+    @synchronized
+    def shutdown(self) -> None:
+        """Shutdown the model cache, cancelling any pending timers."""
+        self._shutdown_event.set()
+        if self._timeout_timer is not None:
+            self._timeout_timer.cancel()
+            self._timeout_timer = None
+
+    @synchronized
+    @record_activity
     def put(self, key: str, model: AnyModel) -> None:
         """Add a model to the cache."""
         if key in self._cached_models:
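Taken together, _record_activity() and _on_timeout() implement a debounce: each cache operation cancels the pending threading.Timer and arms a fresh one, so the clear only fires after a full quiet period. A self-contained sketch of that mechanism, using seconds instead of minutes and a print in place of the cache clear:

    import threading
    import time
    from typing import Callable, Optional

    class KeepAlive:
        def __init__(self, timeout_s: float, on_timeout: Callable[[], None]) -> None:
            self._timeout_s = timeout_s
            self._on_timeout = on_timeout
            self._timer: Optional[threading.Timer] = None
            self._lock = threading.Lock()

        def record_activity(self) -> None:
            with self._lock:
                if self._timer is not None:
                    self._timer.cancel()  # reset the countdown on every activity
                self._timer = threading.Timer(self._timeout_s, self._on_timeout)
                self._timer.daemon = True  # don't block interpreter exit
                self._timer.start()

    ka = KeepAlive(0.2, lambda: print("idle: clearing cache"))
    ka.record_activity()
    time.sleep(0.1)
    ka.record_activity()  # arrives before the timeout, so the timer is re-armed
    time.sleep(0.3)       # quiet period elapses and the callback fires once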
@@ -228,7 +322,7 @@ def put(self, key: str, model: AnyModel) -> None:
             return

         size = calc_model_size_by_data(self._logger, model)
-        self.make_room(size)
+        self._make_room_internal(size)

         # Inject custom modules into the model.
         if isinstance(model, torch.nn.Module):

@@ -272,6 +366,7 @@ def _get_cache_snapshot(self) -> dict[str, CacheEntrySnapshot]:
         return overview

     @synchronized
+    @record_activity
     def get(self, key: str, stats_name: Optional[str] = None) -> CacheRecord:
         """Retrieve a model from the cache.

@@ -309,9 +404,11 @@ def get(self, key: str, stats_name: Optional[str] = None) -> CacheRecord:
         self._logger.debug(f"Cache hit: {key} (Type: {cache_entry.cached_model.model.__class__.__name__})")
         for cb in self._on_cache_hit_callbacks:
             cb(model_key=key, cache_snapshot=self._get_cache_snapshot())
+
         return cache_entry

     @synchronized
+    @record_activity
     def lock(self, cache_entry: CacheRecord, working_mem_bytes: Optional[int]) -> None:
         """Lock a model for use and move it into VRAM."""
         if cache_entry.key not in self._cached_models:

@@ -348,6 +445,7 @@ def lock(self, cache_entry: CacheRecord, working_mem_bytes: Optional[int]) -> No
         self._log_cache_state()

     @synchronized
+    @record_activity
     def unlock(self, cache_entry: CacheRecord) -> None:
         """Unlock a model."""
         if cache_entry.key not in self._cached_models:

@@ -691,6 +789,10 @@ def make_room(self, bytes_needed: int) -> None:
        external references to the model, there's nothing that the cache can do about it, and those models will not be
        garbage-collected.
        """
+        self._make_room_internal(bytes_needed)
+
+    def _make_room_internal(self, bytes_needed: int) -> None:
+        """Internal implementation of make_room(). Assumes the lock is already held."""
         self._logger.debug(f"Making room for {bytes_needed / MB:.2f}MB of RAM.")
         self._log_cache_state(title="Before dropping models:")
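The make_room()/_make_room_internal() split exists because the cache's lock is not reentrant: public entry points take the lock, while callers that already hold it (such as _on_timeout(), which is wrapped in @synchronized) must call the internal variant directly. A minimal sketch of the pattern with a hypothetical Cache class:

    import threading

    class Cache:
        def __init__(self) -> None:
            self._lock = threading.Lock()

        def make_room(self, bytes_needed: int) -> None:
            with self._lock:  # public entry point: acquire the lock
                self._make_room_internal(bytes_needed)

        def _make_room_internal(self, bytes_needed: int) -> None:
            # Assumes self._lock is already held by the caller.
            print(f"evicting until {bytes_needed} bytes are free")

        def on_timeout(self) -> None:
            with self._lock:
                # Already inside the lock, so use the internal variant;
                # calling self.make_room() here would deadlock on the
                # non-reentrant Lock.
                self._make_room_internal(10**12)

    Cache().on_timeout()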

invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py

Lines changed: 0 additions & 1 deletion
@@ -140,7 +140,6 @@ def _load_from_singlefile(
         # Some weights of the model checkpoint were not used when initializing CLIPTextModelWithProjection:
         # ['text_model.embeddings.position_ids']

-        self._logger.info(f"Loading model from single file at {config.path} using {load_class.__name__}")
         with SilenceWarnings():
             pipeline = load_class.from_single_file(config.path, torch_dtype=self._torch_dtype)
invokeai/frontend/web/src/services/api/schema.ts

Lines changed: 7 additions & 0 deletions
@@ -13036,6 +13036,7 @@ export type components = {
      * max_cache_ram_gb: The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.
      * max_cache_vram_gb: The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.
      * log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
+     * model_cache_keep_alive_min: How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.
      * device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.
      * enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.
      * keep_ram_copy_of_weights: Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.

@@ -13279,6 +13280,12 @@ export type components = {
      * @default false
      */
     log_memory_usage?: boolean;
+    /**
+     * Model Cache Keep Alive Min
+     * @description How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.
+     * @default 0
+     */
+    model_cache_keep_alive_min?: number;
     /**
      * Device Working Mem Gb
      * @description The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.

0 commit comments