Skip to content

Commit 20f821b

Browse files
SunMarc and Wauplin
committed
Support max_shard_size as string in split_state_dict_into_shards_factory (#2286)
* fix max-shard-size
* for torch also
* add tests
* Fix styling + do not support KiB

---------

Co-authored-by: Lucain Pouget <[email protected]>
1 parent 12b34d7 commit 20f821b

File tree

5 files changed

+70
-9
lines changed

5 files changed

+70
-9
lines changed

src/huggingface_hub/serialization/_base.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"""Contains helpers to split tensors into shards."""
1515

1616
from dataclasses import dataclass, field
17-
from typing import Any, Callable, Dict, List, Optional, TypeVar
17+
from typing import Any, Callable, Dict, List, Optional, TypeVar, Union
1818

1919
from .. import logging
2020

@@ -46,7 +46,7 @@ def split_state_dict_into_shards_factory(
4646
get_tensor_size: TensorSizeFn_T,
4747
get_storage_id: StorageIDFn_T = lambda tensor: None,
4848
filename_pattern: str = FILENAME_PATTERN,
49-
max_shard_size: int = MAX_SHARD_SIZE,
49+
max_shard_size: Union[int, str] = MAX_SHARD_SIZE,
5050
) -> StateDictSplit:
5151
"""
5252
Split a model state dictionary in shards so that each shard is smaller than a given size.
@@ -89,6 +89,9 @@ def split_state_dict_into_shards_factory(
8989
current_shard_size = 0
9090
total_size = 0
9191

92+
if isinstance(max_shard_size, str):
93+
max_shard_size = parse_size_to_int(max_shard_size)
94+
9295
for key, tensor in state_dict.items():
9396
# when bnb serialization is used the weights in the state dict can be strings
9497
# check: https://github.com/huggingface/transformers/pull/24416 for more details
@@ -167,3 +170,44 @@ def split_state_dict_into_shards_factory(
167170
filename_to_tensors=filename_to_tensors,
168171
tensor_to_filename=tensor_name_to_filename,
169172
)
173+
174+
175+
SIZE_UNITS = {
176+
"TB": 10**12,
177+
"GB": 10**9,
178+
"MB": 10**6,
179+
"KB": 10**3,
180+
}
181+
182+
183+
def parse_size_to_int(size_as_str: str) -> int:
184+
"""
185+
Parse a size expressed as a string with digits and unit (like `"5MB"`) to an integer (in bytes).
186+
187+
Supported units are "TB", "GB", "MB", "KB".
188+
189+
Args:
190+
size_as_str (`str`): The size to convert. Will be directly returned if an `int`.
191+
192+
Example:
193+
194+
```py
195+
>>> parse_size_to_int("5MB")
196+
5000000
197+
```
198+
"""
199+
size_as_str = size_as_str.strip()
200+
201+
# Parse unit
202+
unit = size_as_str[-2:].upper()
203+
if unit not in SIZE_UNITS:
204+
raise ValueError(f"Unit '{unit}' not supported. Supported units are TB, GB, MB, KB. Got '{size_as_str}'.")
205+
multiplier = SIZE_UNITS[unit]
206+
207+
# Parse value
208+
try:
209+
value = float(size_as_str[:-2].strip())
210+
except ValueError as e:
211+
raise ValueError(f"Could not parse the size value from '{size_as_str}': {e}") from e
212+
213+
return int(value * multiplier)

src/huggingface_hub/serialization/_numpy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414
"""Contains numpy-specific helpers."""
1515

16-
from typing import TYPE_CHECKING, Dict
16+
from typing import TYPE_CHECKING, Dict, Union
1717

1818
from ._base import FILENAME_PATTERN, MAX_SHARD_SIZE, StateDictSplit, split_state_dict_into_shards_factory
1919

@@ -26,7 +26,7 @@ def split_numpy_state_dict_into_shards(
2626
state_dict: Dict[str, "np.ndarray"],
2727
*,
2828
filename_pattern: str = FILENAME_PATTERN,
29-
max_shard_size: int = MAX_SHARD_SIZE,
29+
max_shard_size: Union[int, str] = MAX_SHARD_SIZE,
3030
) -> StateDictSplit:
3131
"""
3232
Split a model state dictionary in shards so that each shard is smaller than a given size.

src/huggingface_hub/serialization/_tensorflow.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import math
1717
import re
18-
from typing import TYPE_CHECKING, Dict
18+
from typing import TYPE_CHECKING, Dict, Union
1919

2020
from ._base import MAX_SHARD_SIZE, StateDictSplit, split_state_dict_into_shards_factory
2121

@@ -28,7 +28,7 @@ def split_tf_state_dict_into_shards(
2828
state_dict: Dict[str, "tf.Tensor"],
2929
*,
3030
filename_pattern: str = "tf_model{suffix}.h5",
31-
max_shard_size: int = MAX_SHARD_SIZE,
31+
max_shard_size: Union[int, str] = MAX_SHARD_SIZE,
3232
) -> StateDictSplit:
3333
"""
3434
Split a model state dictionary in shards so that each shard is smaller than a given size.

src/huggingface_hub/serialization/_torch.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import importlib
1717
from functools import lru_cache
18-
from typing import TYPE_CHECKING, Dict, Tuple
18+
from typing import TYPE_CHECKING, Dict, Tuple, Union
1919

2020
from ._base import FILENAME_PATTERN, MAX_SHARD_SIZE, StateDictSplit, split_state_dict_into_shards_factory
2121

@@ -28,7 +28,7 @@ def split_torch_state_dict_into_shards(
2828
state_dict: Dict[str, "torch.Tensor"],
2929
*,
3030
filename_pattern: str = FILENAME_PATTERN,
31-
max_shard_size: int = MAX_SHARD_SIZE,
31+
max_shard_size: Union[int, str] = MAX_SHARD_SIZE,
3232
) -> StateDictSplit:
3333
"""
3434
Split a model state dictionary in shards so that each shard is smaller than a given size.
@@ -67,7 +67,7 @@ def split_torch_state_dict_into_shards(
6767
6868
>>> def save_state_dict(state_dict: Dict[str, torch.Tensor], save_directory: str):
6969
... state_dict_split = split_torch_state_dict_into_shards(state_dict)
70-
... for filename, tensors in state_dict_split.filename_to_tensors.values():
70+
... for filename, tensors in state_dict_split.filename_to_tensors.items():
7171
... shard = {tensor: state_dict[tensor] for tensor in tensors}
7272
... safe_save_file(
7373
... shard,

tests/test_serialization.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
import pytest
2+
13
from huggingface_hub.serialization import split_state_dict_into_shards_factory
4+
from huggingface_hub.serialization._base import parse_size_to_int
25
from huggingface_hub.serialization._numpy import get_tensor_size as get_tensor_size_numpy
36
from huggingface_hub.serialization._tensorflow import get_tensor_size as get_tensor_size_tensorflow
47
from huggingface_hub.serialization._torch import get_tensor_size as get_tensor_size_torch
@@ -123,3 +126,17 @@ def test_get_tensor_size_torch():
123126

124127
assert get_tensor_size_torch(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float64)) == 5 * 8
125128
assert get_tensor_size_torch(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float16)) == 5 * 2
129+
130+
131+
def test_parse_size_to_int():
132+
assert parse_size_to_int("1KB") == 1 * 10**3
133+
assert parse_size_to_int("2MB") == 2 * 10**6
134+
assert parse_size_to_int("3GB") == 3 * 10**9
135+
assert parse_size_to_int(" 10 KB ") == 10 * 10**3 # ok with whitespace
136+
assert parse_size_to_int("20mb") == 20 * 10**6 # ok with lowercase
137+
138+
with pytest.raises(ValueError, match="Unit 'IB' not supported"):
139+
parse_size_to_int("1KiB") # not a valid unit
140+
141+
with pytest.raises(ValueError, match="Could not parse the size value"):
142+
parse_size_to_int("1ooKB") # not a float

0 commit comments

Comments (0)