Skip to content

Commit b9493dd

Browse files
authored
Workaround for Windows being unable to remove tmp directories when installing GGUF files (#8699)
* (bugfix)(mm) work around Windows being unable to rmtree tmp directories after GGUF install * (style) fix ruff error * (fix) add workaround for Windows Permission Denied on GGUF file move() call * (fix) perform torch copy() in GGUF reader to avoid deletion failures on Windows * (style) fix ruff formatting issues
1 parent ddb85ca commit b9493dd

File tree

2 files changed

+59
-4
lines changed

2 files changed

+59
-4
lines changed

invokeai/app/services/model_install/model_install_default.py

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
"""Model installation class."""
22

3+
import gc
34
import locale
45
import os
56
import re
7+
import sys
68
import threading
79
import time
810
from copy import deepcopy
@@ -187,6 +189,22 @@ def register_path(
187189
config.source_type = ModelSourceType.Path
188190
return self._register(model_path, config)
189191

192+
# TODO: Replace this with a proper fix for underlying problem of Windows holding open
193+
# the file when it needs to be moved.
194+
@staticmethod
def _move_with_retries(src: Path, dst: Path, attempts: int = 5, delay: float = 0.5) -> None:
    """Move ``src`` to ``dst``, retrying on ``PermissionError`` under Windows.

    Workaround for Windows file-handle issues when moving files. Between
    attempts a garbage-collection pass is forced (presumably to drop lingering
    references that keep the file open) and the call sleeps with exponential
    backoff.

    Args:
        src: Path to move.
        dst: Destination, passed to ``move`` unchanged.
        attempts: Maximum number of move attempts. Values below 1 are treated
            as 1 so the move is always tried at least once instead of being
            silently skipped.
        delay: Seconds to sleep before the first retry; doubled after each
            failed attempt.

    Raises:
        PermissionError: If the last attempt fails, or immediately on
            non-Windows platforms, where retrying is not expected to help.
    """
    # Guard against attempts <= 0, which would otherwise skip the move entirely
    # and return as if it had succeeded.
    attempts = max(1, attempts)
    for attempt in range(1, attempts + 1):
        try:
            move(src, dst)
            return
        except PermissionError:
            # Only Windows is retried (same platform check as _safe_rmtree);
            # elsewhere the error is re-raised without the backoff delay.
            if attempt == attempts or sys.platform != "win32":
                raise
            gc.collect()
            time.sleep(delay)
            delay *= 2  # Exponential backoff
207+
190208
def install_path(
191209
self,
192210
model_path: Union[Path, str],
@@ -205,7 +223,7 @@ def install_path(
205223
dest_dir.mkdir(parents=True)
206224
dest_path = dest_dir / model_path.name if model_path.is_file() else dest_dir
207225
if model_path.is_file():
208-
move(model_path, dest_path)
226+
self._move_with_retries(model_path, dest_path) # Windows workaround TODO: fix root cause
209227
elif model_path.is_dir():
210228
# Move the contents of the directory, not the directory itself
211229
for item in model_path.iterdir():
@@ -500,6 +518,39 @@ def _start_installer_thread(self) -> None:
500518
self._install_thread.start()
501519
self._running = True
502520

521+
@staticmethod
def _safe_rmtree(path: Path, logger: Any, max_retries: int = 3, retry_delay: float = 0.5) -> None:
    """Remove a directory tree with retry logic for Windows file locking issues.

    On Windows, memory-mapped files may not be immediately released even after
    the file handle is closed. This function retries the removal with garbage
    collection to help release any lingering references.

    This is a best-effort cleanup: failures are logged, never raised.

    Args:
        path: Directory tree to remove.
        logger: Object providing ``warning(msg)`` and ``error(msg)``.
        max_retries: Maximum number of removal attempts. Values below 1 are
            treated as 1 so removal is always tried at least once.
        retry_delay: Seconds to sleep before the first retry; doubled after
            each failed attempt.
    """
    max_retries = max(1, max_retries)

    for attempt in range(1, max_retries + 1):
        try:
            # Force garbage collection to release any lingering file references
            gc.collect()
            rmtree(path)
            return
        except PermissionError as e:
            # Retrying only happens on Windows and only while attempts remain.
            if attempt >= max_retries or sys.platform != "win32":
                logger.error(f"Failed to remove temporary directory {path}: {e}")
                # On final failure, don't raise - the temp dir will be cleaned up on next startup
                return
            logger.warning(
                f"Failed to remove {path} (attempt {attempt}/{max_retries}): {e}. "
                f"Retrying in {retry_delay}s..."
            )
            time.sleep(retry_delay)
            retry_delay *= 2  # Exponential backoff
        except Exception as e:
            # Deliberate catch-all: a failed cleanup must not propagate to the caller.
            logger.error(f"Unexpected error removing {path}: {e}")
            return
553+
503554
def _install_next_item(self) -> None:
504555
self._logger.debug(f"Installer thread {threading.get_ident()} starting")
505556
while True:
@@ -529,7 +580,7 @@ def _install_next_item(self) -> None:
529580
finally:
530581
# if this is an install of a remote file, then clean up the temporary directory
531582
if job._install_tmpdir is not None:
532-
rmtree(job._install_tmpdir)
583+
self._safe_rmtree(job._install_tmpdir, self._logger)
533584
self._install_completed_event.set()
534585
self._install_queue.task_done()
535586
self._logger.info(f"Installer thread {threading.get_ident()} exiting")
@@ -574,7 +625,7 @@ def _remove_dangling_install_dirs(self) -> None:
574625
path = self._app_config.models_path
575626
for tmpdir in path.glob(f"{TMPDIR_PREFIX}*"):
576627
self._logger.info(f"Removing dangling temporary directory {tmpdir}")
577-
rmtree(tmpdir)
628+
self._safe_rmtree(tmpdir, self._logger)
578629

579630
def _scan_for_missing_models(self) -> list[AnyModelConfig]:
580631
"""Scan the models directory for missing models and return a list of them."""

invokeai/backend/quantization/gguf/loaders.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,11 @@ def gguf_sd_loader(path: Path, compute_dtype: torch.dtype) -> dict[str, GGMLTens
4040
with WrappedGGUFReader(path) as reader:
4141
sd: dict[str, GGMLTensor] = {}
4242
for tensor in reader.tensors:
43-
torch_tensor = torch.from_numpy(tensor.data)
43+
# Use .copy() to create a true copy of the data, not a view.
44+
# This is critical on Windows where the memory-mapped file cannot be deleted
45+
# while tensors still hold references to the mapped memory.
46+
torch_tensor = torch.from_numpy(tensor.data.copy())
47+
4448
shape = torch.Size(tuple(int(v) for v in reversed(tensor.shape)))
4549
if tensor.tensor_type in TORCH_COMPATIBLE_QTYPES:
4650
torch_tensor = torch_tensor.view(*shape)

0 commit comments

Comments
 (0)