fix workflow failures

takeshi-yoshimura · takeshi-yoshimura · commit 94324c8809a3 · 2025-06-22T10:01:55.000+09:00
Signed-off-by: Takeshi Yoshimura <tyos@jp.ibm.com>
diff --git a/.github/workflows/test-paddle.yaml b/.github/workflows/test-paddle.yaml
@@ -42,14 +42,10 @@ jobs:
           cd tests
           LIBDIR=`python3 -c "import os; os.chdir('/tmp'); import fastsafetensors; print(os.path.dirname(fastsafetensors.__file__))"`
           mkdir -p /tmp/pytest-log
-          export TEST_FASTSAFETENSORS_FRAMEWORK=torch
-          COVERAGE_FILE=.coverage_0 pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/0.log 2>&1
-          COVERAGE_FILE=.coverage_1 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/pytest-log/1.log 2>&1 &
-          COVERAGE_FILE=.coverage_2 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/pytest-log/2.log 2>&1
           export TEST_FASTSAFETENSORS_FRAMEWORK=paddle
-          COVERAGE_FILE=.coverage_3 pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/3.log 2>&1
-          COVERAGE_FILE=.coverage_4 WORLD_SIZE=2 python3 -m paddle.distributed.launch --nnodes 2 --master 127.0.0.1:1234 --rank 0 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/pytest-log/4.log 2>&1 & \
-          COVERAGE_FILE=.coverage_5 WORLD_SIZE=2 python3 -m paddle.distributed.launch --nnodes 2 --master 127.0.0.1:1234 --rank 1 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/pytest-log/5.log 2>&1 && \
+          COVERAGE_FILE=.coverage_0 pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/0.log 2>&1
+          COVERAGE_FILE=.coverage_1 WORLD_SIZE=2 python3 -m paddle.distributed.launch --nnodes 2 --master 127.0.0.1:1234 --rank 0 tests/test_multi.py --cov=$(LIBDIR) -s tests/test_multi.py > /tmp/pytest-log/1.log 2>&1 & \
+          COVERAGE_FILE=.coverage_2 WORLD_SIZE=2 python3 -m paddle.distributed.launch --nnodes 2 --master 127.0.0.1:1234 --rank 1 tests/test_multi.py --cov=$(LIBDIR) -s tests/test_multi.py > /tmp/pytest-log/2.log 2>&1 && \
           coverage combine .coverage_*
           coverage html
           mv htmlcov /tmp/pytest-log
diff --git a/.github/workflows/test-torch.yaml b/.github/workflows/test-torch.yaml
@@ -58,8 +58,8 @@ jobs:
           mkdir -p /tmp/pytest-log
           export TEST_FASTSAFETENSORS_FRAMEWORK=pytorch
           COVERAGE_FILE=.coverage_0 pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/0.log 2>&1
-          COVERAGE_FILE=.coverage_1 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/pytest-log/1.log 2>&1 &
-          COVERAGE_FILE=.coverage_2 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/pytest-log/2.log 2>&1
+          COVERAGE_FILE=.coverage_1 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 tests/test_multi.py --cov=$(LIBDIR) -s tests/test_multi.py > /tmp/pytest-log/1.log 2>&1 &
+          COVERAGE_FILE=.coverage_2 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 tests/test_multi.py --cov=$(LIBDIR) -s tests/test_multi.py > /tmp/pytest-log/2.log 2>&1
           coverage combine .coverage_*
           coverage html
           mv htmlcov /tmp/pytest-log
diff --git a/examples/run_parallel.py b/examples/run_parallel.py
@@ -7,6 +7,7 @@
 # PIDS+=$($!)
 # wait ${PIDS[@]}
 
+
 def run_torch():
     import torch
     import torch.distributed as dist
@@ -17,6 +18,7 @@ def run_torch():
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     return pg, device
 
+
 def run_paddle():
     import paddle
     import paddle.distributed as dist
@@ -27,13 +29,15 @@ def run_paddle():
     device = "gpu" if paddle.device.cuda.device_count() else "cpu"
     return pg, device
 
+
 runs = {
     "torch": run_torch,
     "paddle": run_paddle,
 }
 
 if __name__ == "__main__":
     import sys
+
     from fastsafetensors import SafeTensorsFileLoader
 
     framework = "torch"
diff --git a/examples/run_single.py b/examples/run_single.py
@@ -1,30 +1,41 @@
 #!/usr/bin/env python3
 
+
 def run_torch():
     import torch
+
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     return device
 
+
 def run_paddle():
     import paddle
+
     device = "gpu" if paddle.device.cuda.device_count() else "cpu"
     return device
 
+
 runs = {
     "torch": run_torch,
     "paddle": run_paddle,
 }
 
 if __name__ == "__main__":
     import sys
-    from fastsafetensors import fastsafe_open
+
     from fastsafetensors import cpp as fstcpp
+    from fastsafetensors import fastsafe_open
 
     framework = "torch"
     if len(sys.argv) > 1:
         framework = sys.argv[1]
 
     device = runs[framework]()
-    with fastsafe_open(["a.safetensors", "b.safetensors"], device=device, nogds=not fstcpp.is_cufile_found(), framework=framework) as f:
+    with fastsafe_open(
+        ["a.safetensors", "b.safetensors"],
+        device=device,
+        nogds=not fstcpp.is_cufile_found(),
+        framework=framework,
+    ) as f:
         print(f"a0: {f.get_tensor(name='a0')}")
         print(f"b0: {f.get_tensor(name='b0')}")
diff --git a/fastsafetensors/common.py b/fastsafetensors/common.py
@@ -10,7 +10,7 @@
 
 from . import cpp as fstcpp
 from .dlpack import from_cuda_buffer
-from .frameworks import TensorBase, FrameworkOpBase
+from .frameworks import FrameworkOpBase, TensorBase
 from .st_types import Device, DType
 
 
@@ -80,7 +80,9 @@ def __init__(
             )
 
     @classmethod
-    def from_buffer(self, buf: int, buffer_len: int, filename: str, framework: FrameworkOpBase):
+    def from_buffer(
+        self, buf: int, buffer_len: int, filename: str, framework: FrameworkOpBase
+    ):
         if buffer_len < 8:
             raise Exception(
                 f"from_buffer: HeaderTooSmall, filename={filename}, buffer_len={buffer_len}"
@@ -173,7 +175,9 @@ def get_tensors(
                 t2 = t2.view(t.dtype)
 
             if dtype != DType.AUTO and dtype != t.dtype:
-                if self.framework.get_dtype_size(dtype) > self.framework.get_dtype_size(t.dtype):
+                if self.framework.get_dtype_size(dtype) > self.framework.get_dtype_size(
+                    t.dtype
+                ):
                     raise Exception(
                         f"Online type conversion to larger sizes is not supported ({t.dtype} -> {dtype})"
                     )
diff --git a/fastsafetensors/copier/gds.py b/fastsafetensors/copier/gds.py
@@ -5,7 +5,7 @@
 
 from .. import cpp as fstcpp
 from ..common import SafeTensorsMetadata
-from ..frameworks import TensorBase, FrameworkOpBase
+from ..frameworks import FrameworkOpBase, TensorBase
 from ..st_types import Device, DeviceType, DType
 
 
diff --git a/fastsafetensors/copier/nogds.py b/fastsafetensors/copier/nogds.py
@@ -6,7 +6,7 @@
 
 from .. import cpp as fstcpp
 from ..common import SafeTensorsMetadata
-from ..frameworks import TensorBase, FrameworkOpBase
+from ..frameworks import FrameworkOpBase, TensorBase
 from ..st_types import Device, DType
 
 
diff --git a/fastsafetensors/file_buffer.py b/fastsafetensors/file_buffer.py
@@ -4,7 +4,7 @@
 from collections import OrderedDict
 from typing import Dict, List, Optional, Tuple
 
-from .frameworks import ProcessGroupBase, TensorBase, FrameworkOpBase
+from .frameworks import FrameworkOpBase, ProcessGroupBase, TensorBase
 from .st_types import Device, DType
 from .tensor_factory import LazyTensorFactory
 
diff --git a/fastsafetensors/frameworks/__init__.py b/fastsafetensors/frameworks/__init__.py
@@ -163,6 +163,7 @@ def randn(self, s: tuple, dtype: DType) -> T:
     def support_fp8(self) -> bool:
         pass
 
+
 def get_framework_op(name: str) -> FrameworkOpBase:
     if name == "pt" or name == "pytorch" or name == "torch":
         from ._torch import TorchOp
diff --git a/fastsafetensors/frameworks/_paddle.py b/fastsafetensors/frameworks/_paddle.py
@@ -4,8 +4,8 @@
 try:
     import paddle
     import paddle.distributed as pdist
-    from paddle.framework import core as paddle_core
     from paddle.distributed.communication.group import Group
+    from paddle.framework import core as paddle_core
 except ImportError as e:
     raise ImportError(
         "could not import paddle, paddle_core, or numpy. Please install them."
@@ -38,11 +38,12 @@
     DType.F8_E4M3: DType.I8,
 }
 
-if hasattr(paddle, 'float8_e5m2'):
+if hasattr(paddle, "float8_e5m2"):
     dtype_convert[DType.F8_E5M2] = paddle.float8_e5m2
-if hasattr(paddle, 'float8_e4m3fn'):
+if hasattr(paddle, "float8_e4m3fn"):
     dtype_convert[DType.F8_E4M3] = paddle.float8_e4m3fn
 
+
 @dataclass
 class PaddleTensor(TensorBase):
     real_tensor: paddle.Tensor
@@ -222,7 +223,9 @@ def as_workaround_dtype(self, dtype: DType) -> DType:
 
     def get_process_group(self, pg: Optional[Any]) -> PaddleProcessGroup:
         if pg is not None and not isinstance(pg, Group):
-            raise Exception("pg must be an instance of paddle.distributed.communication.group.Group")
+            raise Exception(
+                "pg must be an instance of paddle.distributed.communication.group.Group"
+            )
         return PaddleProcessGroup(pg)
 
     # for testing
@@ -232,7 +235,11 @@ def is_equal(self, wrapped: PaddleTensor, real: Any) -> bool:
         raise Exception("real is not paddle.Tensor")
 
     def randn(self, s: tuple, device: Device, dtype: DType) -> PaddleTensor:
-        return PaddleTensor(device, dtype, paddle.randn(s, dtype=dtype_convert[dtype]).to(device=device.as_str()))
+        return PaddleTensor(
+            device,
+            dtype,
+            paddle.randn(s, dtype=dtype_convert[dtype]).to(device=device.as_str()),
+        )
 
     def support_fp8(self) -> bool:
-        return DType.F8_E5M2 in dtype_convert
+        return DType.F8_E5M2 in dtype_convert
diff --git a/fastsafetensors/frameworks/_torch.py b/fastsafetensors/frameworks/_torch.py
@@ -31,15 +31,15 @@
     DType.F8_E4M3: DType.I8,
 }
 
-if hasattr(torch, 'float8_e5m2'):
+if hasattr(torch, "float8_e5m2"):
     dtype_convert[DType.F8_E5M2] = torch.float8_e5m2
-if hasattr(torch, 'float8_e4m3fn'):
+if hasattr(torch, "float8_e4m3fn"):
     dtype_convert[DType.F8_E4M3] = torch.float8_e4m3fn
-if hasattr(torch, 'uint16'):
+if hasattr(torch, "uint16"):
     dtype_convert[DType.U16] = torch.uint16
-if hasattr(torch, 'uint32'):
+if hasattr(torch, "uint32"):
     dtype_convert[DType.U32] = torch.uint32
-if hasattr(torch, 'uint64'):
+if hasattr(torch, "uint64"):
     dtype_convert[DType.U64] = torch.uint64
 
 
@@ -205,7 +205,11 @@ def is_equal(self, wrapped: TorchTensor, real: Any) -> bool:
         raise Exception("real is not torch.Tensor")
 
     def randn(self, s: tuple, device: Device, dtype: DType) -> TorchTensor:
-        return TorchTensor(device, dtype, torch.randn(s, device=device.as_str(), dtype=dtype_convert[dtype]))
+        return TorchTensor(
+            device,
+            dtype,
+            torch.randn(s, device=device.as_str(), dtype=dtype_convert[dtype]),
+        )
 
     def support_fp8(self) -> bool:
-        return DType.F8_E5M2 in dtype_convert
+        return DType.F8_E5M2 in dtype_convert
diff --git a/fastsafetensors/loader.py b/fastsafetensors/loader.py
@@ -6,10 +6,10 @@
 from typing import Any, Dict, List, Optional, OrderedDict, Tuple, Union
 
 from . import cpp as fstcpp
+from . import frameworks
 from .common import SafeTensorsMetadata, TensorFrame, get_device_numa_node
 from .file_buffer import FilesBufferOnDevice
-from . import frameworks
-from .frameworks import get_framework_op, TensorBase
+from .frameworks import TensorBase, get_framework_op
 from .st_types import DeviceType, DType
 from .tensor_factory import LazyTensorFactory
 
@@ -152,7 +152,9 @@ def copy_files_to_device(
                 need_wait.append(factory)
             lidx += 1
         for factory in need_wait:
-            factory.wait_io(dtype=dtype, noalign=isinstance(self.reader, fstcpp.nogds_file_reader))
+            factory.wait_io(
+                dtype=dtype, noalign=isinstance(self.reader, fstcpp.nogds_file_reader)
+            )
         return FilesBufferOnDevice(factories, pg=self.pg, framework=self.framework)
 
 
diff --git a/fastsafetensors/tensor_factory.py b/fastsafetensors/tensor_factory.py
@@ -8,7 +8,7 @@
 from .common import SafeTensorsMetadata
 from .copier.gds import GdsFileCopier
 from .copier.nogds import NoGdsFileCopier
-from .frameworks import ProcessGroupBase, TensorBase, FrameworkOpBase
+from .frameworks import FrameworkOpBase, ProcessGroupBase, TensorBase
 from .st_types import Device, DType
 
 
@@ -31,9 +31,13 @@ def __init__(
         self.copier: Optional[Union[NoGdsFileCopier, GdsFileCopier]] = None
         if local_rank:
             if isinstance(reader, fstcpp.nogds_file_reader):
-                self.copier = NoGdsFileCopier(metadata, device, reader, framework, debug_log)
+                self.copier = NoGdsFileCopier(
+                    metadata, device, reader, framework, debug_log
+                )
             else:
-                self.copier = GdsFileCopier(metadata, device, reader, framework, debug_log)
+                self.copier = GdsFileCopier(
+                    metadata, device, reader, framework, debug_log
+                )
         self.tensors: Dict[str, TensorBase] = {}
         self.shuffled: Dict[str, TensorBase] = {}
         self.gbuf: Optional[fstcpp.gds_device_buffer] = None
@@ -114,7 +118,9 @@ def shuffle(self, pg: ProcessGroupBase, tensor_name: str, dim: int) -> TensorBas
             if tensor_name in self.tensors:
                 dst = self.tensors[tensor_name].clone().detach()
             else:
-                dst = self.framework.get_empty_tensor(frame.shape, frame.dtype, self.device)
+                dst = self.framework.get_empty_tensor(
+                    frame.shape, frame.dtype, self.device
+                )
             if self.debug_log:
                 print(
                     f"shuffle: broadcast, tensor_name={tensor_name}, shape={frame.shape}, self.rank={self.rank}, pg.rank()={pg.rank()}, has_tensor={tensor_name in self.tensors}"
diff --git a/perf/fastsafetensors_perf/perf.py b/perf/fastsafetensors_perf/perf.py
@@ -14,9 +14,10 @@
 import torch
 import torch.distributed as dist
 import typer
-from fastsafetensors import SafeTensorsFileLoader, SingleGroup
 from safetensors import safe_open
 
+from fastsafetensors import SafeTensorsFileLoader, SingleGroup
+
 app = typer.Typer()
 
 script_path = __file__
@@ -188,19 +189,24 @@ def get_key_dim(
     #    ret[key] = dim
     return ret
 
+
 class MyProc:
-    def __init__(self, popen: Union[subprocess.Popen, None]=None):
+    def __init__(self, popen: Union[subprocess.Popen, None] = None):
         self.popen: Union[subprocess.Popen, None] = popen
+
     def terminate(self):
         if self.popen:
             self.popen.terminate()
+
     def kill(self):
         if self.popen:
             self.popen.kill()
+
     def wait(self, timeout=Union[int, None]):
         if self.popen:
             return self.popen.wait(timeout=timeout)
 
+
 mon_procs: Dict[int, Tuple[MyProc, MyProc, Any, str]] = {}
 
 
@@ -232,15 +238,19 @@ def start_sysstat(
         dool_cmd.append("--nvidia-gpu")
     dool_cmd += ["--output", dool_file]
     try:
-        dool = MyProc(subprocess.Popen(
-            dool_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
-        ))
+        dool = MyProc(
+            subprocess.Popen(
+                dool_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+            )
+        )
     except:
         dool = MyProc()
     try:
-        iostat = MyProc(subprocess.Popen(
-            ["iostat", "1"], stdout=iostat_f, stderr=subprocess.DEVNULL
-        ))
+        iostat = MyProc(
+            subprocess.Popen(
+                ["iostat", "1"], stdout=iostat_f, stderr=subprocess.DEVNULL
+            )
+        )
     except:
         iostat = MyProc()
     id = len(mon_procs)
@@ -270,8 +280,11 @@ def as_safetensors_dtype(dtype_str: str) -> Union[str, None]:
     if dtype_str == "auto":
         return None
     from fastsfaetensors.common import TYPE_MAP
+
     if dtype_str not in TYPE_MAP:
-        raise Exception(f"unsupported type: {dtype_str}. supported types: {TYPE_MAP.keys()}")
+        raise Exception(
+            f"unsupported type: {dtype_str}. supported types: {TYPE_MAP.keys()}"
+        )
     return dtype_str
 
 
diff --git a/tests/conftest.py b/tests/conftest.py
diff --git a/tests/test_fastsafetensors.py b/tests/test_fastsafetensors.py
diff --git a/tests/test_multi.py b/tests/test_multi.py