Skip to content

Commit 1e462f1

Browse files
committed
Force loading tensors on default stream
1 parent 0d5c0bc commit 1e462f1

File tree

3 files changed

+7
-4
lines changed

3 files changed

+7
-4
lines changed

exllamav2/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import torch
44
import math
5-
from exllamav2.fasttensors import STFile
5+
from exllamav2.fasttensors import STFile, cleanup_stfiles
66
from exllamav2.architecture import ExLlamaV2ArchParams
77
import os, glob, json
88
from typing import Any, Dict, List, TypeVar, Union, cast
@@ -370,7 +370,7 @@ def prepare(self, no_tensors: bool = False):
370370
if not match:
371371
raise ValueError(f" ## Could not find {prefix}.* in model")
372372

373-
x = 0
373+
cleanup_stfiles()
374374

375375

376376
def arch_compat_overrides(self, quiet: bool = False, warn_only = False):

exllamav2/fasttensors.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,9 @@ def get_tensor(self,
191191

192192
torch.cuda.synchronize()
193193

194+
if device != "cpu":
195+
torch.cuda.set_stream(torch.cuda.default_stream(device))
196+
194197
if self.tensor_remap and (not_fast or not self.fast):
195198
key = self.tensor_remap[key]
196199

@@ -213,8 +216,6 @@ def get_tensor(self,
213216
size = end - beg
214217
numel = size // esize
215218
shape = h["shape"]
216-
if device != "cpu":
217-
torch.cuda.set_stream(torch.cuda.default_stream(device))
218219
tensor = torch.zeros(shape, dtype = dtype, device = device)
219220
assert tensor.is_contiguous, "Non-contiguous tensor"
220221
ext_c.safetensors_read_fb(self.handle_fb, beg + self.header_size, size, tensor)

exllamav2/linear.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,8 @@ def tp_split(self, broadcast_type: int, dim = None):
597597
)
598598
)
599599

600+
torch.cuda.synchronize()
601+
600602
ext_c.free_q_matrix(self.q_handle)
601603
self.q_handle = new_q_handle
602604
self.q_tensors = new_q_tensors

0 commit comments

Comments (0)