Merge branch 'main' into stable

city96 · city96 · commit ca582bcb6d1e · 2025-11-03T15:58:26.000+01:00
diff --git a/nodes.py b/nodes.py
@@ -76,16 +76,29 @@ def unpatch_model(self, device_to=None, unpatch_weights=True):
         # TODO: Find another way to not unload after patches
         return super().unpatch_model(device_to=device_to, unpatch_weights=unpatch_weights)
 
+
+    def pin_weight_to_device(self, key):
+        op_key = key.rsplit('.', 1)[0]
+        if self.named_modules_to_munmap is not None and op_key in self.named_modules_to_munmap:
+            # TODO: possible to OOM, find better way to detach
+            self.named_modules_to_munmap[op_key].to(self.load_device).to(self.offload_device)
+            del self.named_modules_to_munmap[op_key]
+        super().pin_weight_to_device(key)
+
     mmap_released = False
+
     def load(self, *args, force_patch_weights=False, **kwargs):
+        if not self.mmap_released:
+            self.named_modules_to_munmap = dict(self.model.named_modules())
+
         # always call `patch_weight_to_device` even for lowvram
         super().load(*args, force_patch_weights=True, **kwargs)
 
         # make sure nothing stays linked to mmap after first load
         if not self.mmap_released:
             linked = []
             if kwargs.get("lowvram_model_memory", 0) > 0:
-                for n, m in self.model.named_modules():
+                for n, m in self.named_modules_to_munmap.items():
                     if hasattr(m, "weight"):
                         device = getattr(m.weight, "device", None)
                         if device == self.offload_device:
@@ -102,6 +115,7 @@ def load(self, *args, force_patch_weights=False, **kwargs):
                     # TODO: possible to OOM, find better way to detach
                     m.to(self.load_device).to(self.offload_device)
             self.mmap_released = True
+        self.named_modules_to_munmap = None
 
     def clone(self, *args, **kwargs):
         src_cls = self.__class__
diff --git a/ops.py b/ops.py
@@ -153,7 +153,7 @@ def ggml_save_to_state_dict(self, destination, prefix, keep_vars):
         # Take into account space required for dequantizing the largest tensor
         if self.largest_layer:
             shape = getattr(self.weight, "tensor_shape", self.weight.shape)
-            dtype = self.dequant_dtype or torch.float16
+            dtype = self.dequant_dtype if self.dequant_dtype and self.dequant_dtype != "target" else torch.float16
             temp = torch.empty(*shape, device=torch.device("meta"), dtype=dtype)
             destination[prefix + "temp.weight"] = temp