Skip to content

Commit e4e085e

Browse files
refactor for lint
TODO: STProcessGroupWrapper Signed-off-by: Takeshi Yoshimura <[email protected]>
1 parent 041b899 commit e4e085e

File tree

12 files changed

+957
-438
lines changed

12 files changed

+957
-438
lines changed

fastsafetensors/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
# Copyright 2024 IBM Inc. All rights reserved
22
# SPDX-License-Identifier: Apache-2.0
33

4-
from .common import (SafeTensorsMetadata, SingleGroup, TensorFrame,
5-
alloc_tensor_memory, free_tensor_memory,
6-
get_device_numa_node, str_to_dtype)
4+
from .common import (
5+
SafeTensorsMetadata,
6+
TensorFrame,
7+
alloc_tensor_memory,
8+
free_tensor_memory,
9+
get_device_numa_node,
10+
)
711
from .file_buffer import FilesBufferOnDevice
812
from .loader import SafeTensorsFileLoader, fastsafe_open
13+
from .st_types import SingleGroup, STDevice, STDeviceType, STEnv

fastsafetensors/common.py

Lines changed: 247 additions & 137 deletions
Large diffs are not rendered by default.

fastsafetensors/copier/gds.py

Lines changed: 71 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,55 @@
11
# Copyright 2024 IBM Inc. All rights reserved
22
# SPDX-License-Identifier: Apache-2.0
33

4+
from typing import Dict, Optional
5+
46
import torch
7+
58
from .. import cpp as fstcpp
6-
from typing import Dict
7-
from ..common import alloc_tensor_memory, free_tensor_memory, SafeTensorsMetadata, ALIGN, CUDA_PTR_ALIGN, paddle_loaded
8-
if paddle_loaded:
9-
import paddle
9+
from ..common import (
10+
ALIGN,
11+
CUDA_PTR_ALIGN,
12+
CUDA_VER,
13+
SafeTensorsMetadata,
14+
alloc_tensor_memory,
15+
free_tensor_memory,
16+
)
17+
from ..st_types import STDevice, STDeviceType, STDType
18+
1019

1120
class GdsFileCopier:
12-
def __init__(self, metadata: SafeTensorsMetadata, device: torch.device, reader: fstcpp.gds_file_reader, debug_log: bool=False):
21+
def __init__(
22+
self,
23+
metadata: SafeTensorsMetadata,
24+
device: STDevice,
25+
reader: fstcpp.gds_file_reader,
26+
debug_log: bool = False,
27+
):
1328
self.metadata = metadata
1429
self.device = device
1530
self.reader = reader
1631
self.debug_log = debug_log
1732
self.gbuf = None
18-
self.fh = 0
33+
self.fh: Optional[fstcpp.gds_file_handle] = None
1934
self.copy_reqs: Dict[int, int] = {}
2035
self.aligned_length = 0
21-
try:
22-
if self.metadata.framework == "pytorch":
23-
cuda_vers_list = torch.version.cuda.split('.')
24-
elif paddle_loaded and self.metadata.framework == "paddle":
25-
cuda_vers_list = paddle.version.cuda().split('.')
26-
cudavers = list(map(int, cuda_vers_list))
27-
# CUDA 12.2 (GDS version 1.7) introduces support for non O_DIRECT file descriptors
28-
# Compatible with CUDA 11.x
29-
self.o_direct = not (cudavers[0] > 12 or (cudavers[0] == 12 and cudavers[1] >= 2))
30-
except:
31-
self.o_direct = True
36+
cudavers = list(map(int, CUDA_VER.split(".")))
37+
# CUDA 12.2 (GDS version 1.7) introduces support for non O_DIRECT file descriptors
38+
# Compatible with CUDA 11.x
39+
self.o_direct = not (
40+
cudavers[0] > 12 or (cudavers[0] == 12 and cudavers[1] >= 2)
41+
)
3242

3343
def set_o_direct(self, enable: bool):
3444
self.o_direct = enable
3545

36-
def submit_io(self, use_buf_register: bool, max_copy_block_size: int)->fstcpp.gds_device_buffer:
37-
dev_is_cuda = (self.metadata.framework == "pytorch" and self.device.type == 'cuda') or (paddle_loaded and self.metadata.framework == "paddle" and "gpu" in self.device)
46+
def submit_io(
47+
self, use_buf_register: bool, max_copy_block_size: int
48+
) -> fstcpp.gds_device_buffer:
49+
dev_is_cuda = (
50+
self.device.type == STDeviceType.CUDA
51+
or self.device.type == STDeviceType.GPU
52+
)
3853
self.fh = fstcpp.gds_file_handle(self.metadata.src, self.o_direct, dev_is_cuda)
3954
offset = self.metadata.header_length
4055
length = self.metadata.size_bytes - self.metadata.header_length
@@ -55,7 +70,11 @@ def submit_io(self, use_buf_register: bool, max_copy_block_size: int)->fstcpp.gd
5570
if req_len > max_copy_block_size:
5671
req_len = max_copy_block_size
5772
if gbuf.cufile_register(count, req_len) < 0:
58-
raise Exception("submit_io: register_buffer failed, ptr=0x{:x}, count={}, len={}".format(gbuf.get_base_address(), count, req_len))
73+
raise Exception(
74+
"submit_io: register_buffer failed, ptr=0x{:x}, count={}, len={}".format(
75+
gbuf.get_base_address(), count, req_len
76+
)
77+
)
5978
count += req_len
6079

6180
count = 0
@@ -64,40 +83,63 @@ def submit_io(self, use_buf_register: bool, max_copy_block_size: int)->fstcpp.gd
6483
if req_len > max_copy_block_size:
6584
req_len = max_copy_block_size
6685
# TODO: pass timeout so that wait_copy_tensors can recognize too slow pread()
67-
req = self.reader.submit_read(self.fh, gbuf, aligned_offset + count, req_len, count, self.metadata.size_bytes)
86+
req = self.reader.submit_read(
87+
self.fh,
88+
gbuf,
89+
aligned_offset + count,
90+
req_len,
91+
count,
92+
self.metadata.size_bytes,
93+
)
6894
self.copy_reqs[req] = -1 if not use_buf_register else count
6995
count += req_len
7096
self.aligned_offset = aligned_offset
7197
self.aligned_length = aligned_length
7298
return gbuf
7399

74-
def wait_io(self, gbuf: fstcpp.gds_device_buffer, dtype: torch.dtype=None, noalign: bool=False)->Dict[str, torch.Tensor]:
100+
def wait_io(
101+
self,
102+
gbuf: fstcpp.gds_device_buffer,
103+
dtype: STDType = STDType.AUTO,
104+
noalign: bool = False,
105+
) -> Dict[str, torch.Tensor]:
75106
failed = []
76-
for req, c in sorted(self.copy_reqs.items(), key=lambda x:x[0]):
107+
for req, c in sorted(self.copy_reqs.items(), key=lambda x: x[0]):
77108
count = self.reader.wait_read(req)
78109
if count < 0:
79110
failed.append(req)
80111
if c != -1:
81112
gbuf.cufile_deregister(c)
82-
if self.fh != 0:
113+
if self.fh is not None:
83114
del self.fh
84-
self.fh = 0
115+
self.fh = None
85116
if len(failed) > 0:
86-
raise Exception(f"wait_io: wait_gds_read failed, failed={failed}, reqs={self.copy_reqs}")
117+
raise Exception(
118+
f"wait_io: wait_gds_read failed, failed={failed}, reqs={self.copy_reqs}"
119+
)
87120
self.copy_reqs = {}
88121
if not noalign and not self.metadata.aligned and self.aligned_length > 0:
89122
misaligned_bytes = self.metadata.header_length % CUDA_PTR_ALIGN
90-
length = 1024*1024*1024
123+
length = 1024 * 1024 * 1024
91124
tmp_gbuf = alloc_tensor_memory(length, self.device, self.metadata.framework)
92125
count = 0
93126
while count + misaligned_bytes < self.aligned_length:
94127
l = self.aligned_length - misaligned_bytes - count
95128
if l > length:
96129
l = length
97130
if self.debug_log:
98-
print("wait_io: fix misalignment, src=0x{:x}, misaligned_bytes={}, count={}, tmp=0x{:x}".format(gbuf.get_base_address(), misaligned_bytes, count, tmp_gbuf.get_base_address()))
131+
print(
132+
"wait_io: fix misalignment, src=0x{:x}, misaligned_bytes={}, count={}, tmp=0x{:x}".format(
133+
gbuf.get_base_address(),
134+
misaligned_bytes,
135+
count,
136+
tmp_gbuf.get_base_address(),
137+
)
138+
)
99139
gbuf.memmove(count, misaligned_bytes + count, tmp_gbuf, l)
100140
count += l
101141
free_tensor_memory(tmp_gbuf, self.device, self.metadata.framework)
102142
self.aligned_offset += misaligned_bytes
103-
return self.metadata.get_tensors(gbuf, self.device, self.aligned_offset, dtype=dtype)
143+
return self.metadata.get_tensors(
144+
gbuf, self.device, self.aligned_offset, dtype=dtype
145+
)

fastsafetensors/copier/nogds.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,67 @@
11
# Copyright 2024 IBM Inc. All rights reserved
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import torch
54
import os
5+
from typing import Dict, List
6+
7+
import torch
8+
69
from .. import cpp as fstcpp
7-
from typing import Dict
8-
from ..common import alloc_tensor_memory, SafeTensorsMetadata, ALIGN, CUDA_PTR_ALIGN
10+
from ..common import ALIGN, CUDA_PTR_ALIGN, SafeTensorsMetadata, alloc_tensor_memory
11+
from ..st_types import STDevice, STDType
12+
913

1014
class NoGdsFileCopier:
11-
def __init__(self, metadata: SafeTensorsMetadata, device: torch.device, reader: fstcpp.nogds_file_reader, debug_log: bool=False):
15+
def __init__(
16+
self,
17+
metadata: SafeTensorsMetadata,
18+
device: STDevice,
19+
reader: fstcpp.nogds_file_reader,
20+
debug_log: bool = False,
21+
):
1222
self.metadata = metadata
1323
self.reader = reader
1424
self.fd = os.open(metadata.src, os.O_RDONLY, 0o644)
1525
if self.fd < 0:
16-
raise Exception(f"NoGdsFileCopier.__init__: failed to open, file={metadata.src}")
26+
raise Exception(
27+
f"NoGdsFileCopier.__init__: failed to open, file={metadata.src}"
28+
)
1729
self.device = device
1830
self.debug_log = debug_log
19-
self.reqs = []
31+
self.reqs: List[int] = []
2032

21-
def submit_io(self, use_buf_register: bool, max_copy_block_size: int)->fstcpp.gds_device_buffer:
33+
def submit_io(
34+
self, use_buf_register: bool, max_copy_block_size: int
35+
) -> fstcpp.gds_device_buffer:
2236
total_length = self.metadata.size_bytes - self.metadata.header_length
2337
gbuf = alloc_tensor_memory(total_length, self.device, self.metadata.framework)
2438
count = 0
2539
while count < total_length:
2640
l = total_length - count
2741
if max_copy_block_size < l:
2842
l = max_copy_block_size
29-
req = self.reader.submit_read(self.fd, gbuf, self.metadata.header_length + count, l, count)
43+
req = self.reader.submit_read(
44+
self.fd, gbuf, self.metadata.header_length + count, l, count
45+
)
3046
if req < 0:
3147
raise Exception(f"submit_io: submit_nogds_read failed, err={req}")
3248
self.reqs.append(req)
3349
count += l
3450
return gbuf
3551

36-
def wait_io(self, gbuf: fstcpp.gds_device_buffer, dtype: torch.dtype=None, noalign: bool=False)->Dict[str, torch.Tensor]:
52+
def wait_io(
53+
self,
54+
gbuf: fstcpp.gds_device_buffer,
55+
dtype: STDType = STDType.AUTO,
56+
noalign: bool = False,
57+
) -> Dict[str, torch.Tensor]:
3758
for req in self.reqs:
3859
count = self.reader.wait_read(req)
3960
if count < 0:
4061
raise Exception(f"wait_io: wait_nogds_read failed, req={req}")
4162
if self.fd > 0:
4263
os.close(self.fd)
4364
self.fd = 0
44-
return self.metadata.get_tensors(gbuf, self.device, self.metadata.header_length, dtype=dtype)
65+
return self.metadata.get_tensors(
66+
gbuf, self.device, self.metadata.header_length, dtype=dtype
67+
)

fastsafetensors/cpp.pyi

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Copyright 2025 IBM Inc. All rights reserved
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# fastsafetensors/cpp.pyi
5+
6+
class gds_device_buffer:
7+
def __init__(self, devPtr_base: int, length: int, use_cuda: bool) -> None: ...
8+
def cufile_register(self, offset: int, length: int) -> int: ...
9+
def cufile_deregister(self, offset: int) -> int: ...
10+
def memmove(
11+
self, dst_off: int, src_off: int, tmp: "gds_device_buffer", length: int
12+
) -> int: ...
13+
def get_base_address(self) -> int: ...
14+
15+
class nogds_file_reader:
16+
def __init__(
17+
self, use_mmap: bool, bbuf_size_kb: int, max_threads: int, use_cuda: bool
18+
) -> None: ...
19+
def submit_read(
20+
self, fd: int, dst: gds_device_buffer, offset: int, length: int, ptr_off: int
21+
) -> int: ...
22+
def wait_read(self, thread_id: int) -> int: ...
23+
24+
class gds_file_handle:
25+
def __init__(self, filename: str, o_direct: bool, use_cuda: bool) -> None: ...
26+
27+
class gds_file_reader:
28+
def __init__(self, max_threads: int, use_cuda: bool) -> None: ...
29+
def submit_read(
30+
self,
31+
fh: gds_file_handle,
32+
dst: gds_device_buffer,
33+
offset: int,
34+
length: int,
35+
ptr_off: int,
36+
file_length: int,
37+
) -> int: ...
38+
def wait_read(self, id: int) -> int: ...
39+
40+
def is_cuda_found() -> bool: ...
41+
def is_cufile_found() -> bool: ...
42+
def cufile_version() -> int: ...
43+
def get_alignment_size() -> int: ...
44+
def set_debug_log(debug_log: bool) -> None: ...
45+
def init_gds(
46+
max_direct_io_size_in_kb: int, max_pinned_memory_size_in_kb: int
47+
) -> int: ...
48+
def close_gds() -> int: ...
49+
def get_device_pci_bus(deviceId: int) -> str: ...
50+
def set_numa_node(numa_node: int) -> int: ...
51+
def read_buffer(dst: int, length: int) -> bytes: ...
52+
def cpu_malloc(length: int) -> int: ...
53+
def cpu_free(addr: int) -> None: ...
54+
def gpu_malloc(length: int) -> int: ...
55+
def gpu_free(addr: int) -> None: ...
56+
def load_nvidia_functions() -> None: ...

0 commit comments

Comments (0)