src/transformers/modeling_utils.py (173 changes: 59 additions & 114 deletions)

@@ -34,7 +34,7 @@
 from zipfile import is_zipfile
 
 import torch
-from huggingface_hub import split_torch_state_dict_into_shards
+from huggingface_hub import DDUFEntry, get_file_explorer, split_torch_state_dict_into_shards
 from packaging import version
 from torch import Tensor, nn
 from torch.nn import CrossEntropyLoss, Identity
@@ -491,33 +491,33 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
 
 
 def load_state_dict(
-    checkpoint_file: Union[str, os.PathLike],
+    checkpoint_file: Union[str, os.PathLike, DDUFEntry],
     is_quantized: bool = False,
     map_location: Optional[Union[str, torch.device]] = None,
     weights_only: bool = True,
-    dduf_entries=None,
 ):
     """
     Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
     """
-    if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
-        # Check format of the archive
-        if dduf_entries:
-            # TODO: Find a way to only open the metadata
-            with dduf_entries[checkpoint_file].as_mmap() as mm:
+    checkpoint = get_file_explorer(checkpoint_file)
+    if not checkpoint.is_file():
+        raise ValueError(
+            f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
+            "model. Make sure you have saved the model properly."
+        )
+
+    if checkpoint.file_extension == "safetensors":
+        if is_safetensors_available():
+            with checkpoint.as_mmap() as mm:
                 return safetensors.torch.load(mm)
-        else:
-            with safe_open(checkpoint_file, framework="pt") as f:
-                metadata = f.metadata()
-            if metadata.get("format") not in ["pt", "tf", "flax", "mlx"]:
-                raise OSError(
-                    f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
-                    "you save your model with the `save_pretrained` method."
-                )
-            return safe_load_file(checkpoint_file)
+        raise ValueError(
+            f"Cannot load safetensors checkpoint at {checkpoint_file} since safetensors is not installed!"
+        )
+
+    if isinstance(checkpoint_file, DDUFEntry):
+        raise ValueError(f"Corrupted DDUF entry: the DDUF format only supports safetensors as the saving format for model weights, got {checkpoint_file}.")
+
     try:
-        if dduf_entries:
-            raise ValueError("DDUF format is not supported yet with torch format. Please use safetensors")
         if map_location is None:
             if (
                 (
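With this hunk, `load_state_dict` accepts either a filesystem path or a `DDUFEntry` and funnels both through the same mmap-backed safetensors path. A minimal sketch of the two call paths follows; `read_dduf_file` and `DDUFEntry` already exist in `huggingface_hub`, while `get_file_explorer` is the API this PR proposes, and the archive layout below is illustrative:

```python
# Sketch, not part of the diff: both call paths into the refactored
# load_state_dict. A DDUFEntry is a zero-copy view into a .dduf archive,
# so no extraction to disk happens on the second path.
from huggingface_hub import read_dduf_file

from transformers.modeling_utils import load_state_dict

# 1) Plain checkpoint on disk: the file explorer wraps the path and
#    mmaps the safetensors file.
state_dict = load_state_dict("checkpoints/model.safetensors")

# 2) Entry inside a DDUF archive (illustrative entry name): the same
#    safetensors.torch.load(mm) branch is taken.
entries = read_dduf_file("model.dduf")  # Dict[str, DDUFEntry]
state_dict = load_state_dict(entries["text_encoder/model.safetensors"])
```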
@@ -3444,7 +3444,6 @@ def from_pretrained(
         adapter_name = kwargs.pop("adapter_name", "default")
         use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False)
         generation_config = kwargs.pop("generation_config", None)
-        dduf_entries = kwargs.pop("dduf_entries", None)
 
         gguf_file = kwargs.pop("gguf_file", None)
         # Cache path to the GGUF file
@@ -3484,14 +3483,9 @@
             raise ValueError("accelerate is required when loading a GGUF file `pip install accelerate`.")
 
         if commit_hash is None:
-            if not isinstance(config, PretrainedConfig):
-                if dduf_entries:
-                    # files are in an archive, so I'm assuming the commit hash of the archive is enough.
-                    resolved_config_file = next(iter(dduf_entries.items()))[1].dduf_path
-
-                else:
-                    # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible
-                    resolved_config_file = cached_file(
+            commit_hash = getattr(config, "_commit_hash", None)
+            if commit_hash is None:
+                resolved_file = pretrained_model_name_or_path if os.path.isfile(pretrained_model_name_or_path) else cached_file(
                     pretrained_model_name_or_path,
                     CONFIG_NAME,
                     cache_dir=cache_dir,
@@ -3506,33 +3500,28 @@
                     _raise_exceptions_for_missing_entries=False,
                     _raise_exceptions_for_connection_errors=False,
                 )
-                commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
-            else:
-                commit_hash = getattr(config, "_commit_hash", None)
+                commit_hash = extract_commit_hash(resolved_file, commit_hash)
 
+        file_explorer = get_file_explorer(pretrained_model_name_or_path)
         if is_peft_available():
             _adapter_model_path = adapter_kwargs.pop("_adapter_model_path", None)
 
             if _adapter_model_path is None:
-                if dduf_entries:
-                    # TODO: use the global var from peft utils
-                    if os.path.join(pretrained_model_name_or_path, "adapter_config.json") in dduf_entries:
-                        _adapter_model_path = os.path.join(pretrained_model_name_or_path, "adapter_config.json")
-                else:
-                    _adapter_model_path = find_adapter_config_file(
-                        pretrained_model_name_or_path,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        resume_download=resume_download,
-                        proxies=proxies,
-                        local_files_only=local_files_only,
-                        _commit_hash=commit_hash,
-                        **adapter_kwargs,
-                    )
-            if _adapter_model_path is not None and os.path.isfile(_adapter_model_path):
-                with open(_adapter_model_path, "r", encoding="utf-8") as f:
+                _adapter_model_path = find_adapter_config_file(
+                    file_explorer,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    resume_download=resume_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    _commit_hash=commit_hash,
+                    **adapter_kwargs,
+                )
+            if _adapter_model_path is not None:
+                _adapter_file_explorer = get_file_explorer(_adapter_model_path)
+                if _adapter_file_explorer.is_file():
                     _adapter_model_path = pretrained_model_name_or_path
-                    pretrained_model_name_or_path = json.load(f)["base_model_name_or_path"]
+                    pretrained_model_name_or_path = json.loads(_adapter_file_explorer.read_text())["base_model_name_or_path"]
         else:
             _adapter_model_path = None
 
@@ -3615,7 +3604,6 @@ def from_pretrained(
                 subfolder=subfolder,
                 _from_auto=from_auto_class,
                 _from_pipeline=from_pipeline,
-                dduf_entries=dduf_entries,
                 **kwargs,
             )
         else:
@@ -3682,101 +3670,56 @@
                 "You cannot combine Quantization and loading a model from a GGUF file, try again by making sure you did not passed a `quantization_config` or that you did not load a quantized model from the Hub."
             )
         if pretrained_model_name_or_path is not None and gguf_file is None:
-            pretrained_model_name_or_path = str(pretrained_model_name_or_path)
-            # passing a dduf_entries means that we already knows where the file
-            is_local = os.path.isdir(pretrained_model_name_or_path) or dduf_entries
+            file_explorer = get_file_explorer(pretrained_model_name_or_path)
+            is_local = file_explorer.is_dir()
             if is_local:
-                if from_tf and (
-                    os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index"))
-                    or os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")
-                    in dduf_entries
-                ):
+                if from_tf and file_explorer.navigate_to(subfolder, TF_WEIGHTS_NAME + ".index").is_file():
                     # Load from a TF 1.0 checkpoint in priority if from_tf
-                    archive_file = os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")
+                    archive_file = file_explorer.navigate_to(subfolder, TF_WEIGHTS_NAME + ".index")
                 elif from_tf and (
-                    os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME))
-                    or os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME) in dduf_entries
+                    file_explorer.navigate_to(subfolder, TF2_WEIGHTS_NAME).is_file()
                 ):
                     # Load from a TF 2.0 checkpoint in priority if from_tf
-                    archive_file = os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)
+                    archive_file = file_explorer.navigate_to(subfolder, TF2_WEIGHTS_NAME)
                 elif from_flax and (
-                    os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME))
-                    or os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME) in dduf_entries
+                    file_explorer.navigate_to(subfolder, FLAX_WEIGHTS_NAME).is_file()
                 ):
                     # Load from a Flax checkpoint in priority if from_flax
-                    archive_file = os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)
+                    archive_file = file_explorer.navigate_to(subfolder, FLAX_WEIGHTS_NAME)
                 elif use_safetensors is not False and (
-                    os.path.isfile(
-                        os.path.join(
-                            pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant)
-                        )
-                    )
-                    or os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant))
-                    in dduf_entries
+                    file_explorer.navigate_to(subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant)).is_file()
                 ):
                     # Load from a safetensors checkpoint
-                    archive_file = os.path.join(
-                        pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant)
-                    )
+                    archive_file = file_explorer.navigate_to(subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant))
                 elif use_safetensors is not False and (
-                    os.path.isfile(
-                        os.path.join(
-                            pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)
-                        )
-                    )
-                    or os.path.join(
-                        pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)
-                    )
-                    in dduf_entries
+                    file_explorer.navigate_to(subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)).is_file()
                 ):
                     # Load from a sharded safetensors checkpoint
-                    archive_file = os.path.join(
-                        pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)
-                    )
+                    archive_file = file_explorer.navigate_to(subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant))
                     is_sharded = True
                 elif not use_safetensors and (
-                    os.path.isfile(
-                        os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant))
-                    )
-                    or os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant))
-                    in dduf_entries
+                    file_explorer.navigate_to(subfolder, _add_variant(WEIGHTS_NAME, variant)).is_file()
                 ):
                     # Load from a PyTorch checkpoint
-                    archive_file = os.path.join(
-                        pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant)
-                    )
+                    archive_file = file_explorer.navigate_to(subfolder, _add_variant(WEIGHTS_NAME, variant))
                 elif not use_safetensors and (
-                    os.path.isfile(
-                        os.path.join(
-                            pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)
-                        )
-                    )
-                    or os.path.join(
-                        pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)
-                    )
-                    in dduf_entries
+                    file_explorer.navigate_to(subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)).is_file()
                 ):
                     # Load from a sharded PyTorch checkpoint
-                    archive_file = os.path.join(
-                        pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)
-                    )
+                    archive_file = file_explorer.navigate_to(subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant))
                     is_sharded = True
                 # At this stage we don't have a weight file so we will raise an error.
                 elif not use_safetensors and (
-                    os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index"))
-                    or os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME))
-                    or os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")
-                    in dduf_entries
-                    or os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME) in dduf_entries
+                    file_explorer.navigate_to(subfolder, TF_WEIGHTS_NAME + ".index").is_file()
+                    or file_explorer.navigate_to(subfolder, TF2_WEIGHTS_NAME).is_file()
                 ):
                     raise EnvironmentError(
                         f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory"
                         f" {pretrained_model_name_or_path} but there is a file for TensorFlow weights. Use"
                         " `from_tf=True` to load this model from those weights."
                     )
                 elif not use_safetensors and (
-                    os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME))
-                    or os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME) in dduf_entries
+                    file_explorer.navigate_to(subfolder, FLAX_WEIGHTS_NAME).is_file()
                ):
                     raise EnvironmentError(
                         f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory"
@@ -3795,9 +3738,11 @@ def from_pretrained(
                         f" {pretrained_model_name_or_path}."
                     )
         elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
+            # TODO: what would it mean in a DDUF environment?
             archive_file = pretrained_model_name_or_path
             is_local = True
         elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path + ".index")):
+            # TODO: what would it mean in a DDUF environment?
             if not from_tf:
                 raise ValueError(
                     f"We found a TensorFlow checkpoint at {pretrained_model_name_or_path + '.index'}, please set "
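The ladder above repeats one probe per weight format, each now a single `file_explorer.navigate_to(...).is_file()` check instead of a paired `os.path.isfile` / `in dduf_entries` test. The priority order it encodes (safetensors, sharded safetensors, torch, sharded torch) can be condensed as below; this is a sketch assuming the `FileExplorer` interface used in the diff, not code from the PR:

```python
# Sketch of the resolution order from_pretrained implements above. The
# FileExplorer interface (navigate_to/is_file) is this PR's proposed API;
# the constants and the _add_variant helper are existing transformers internals.
from transformers.modeling_utils import _add_variant
from transformers.utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
)

# (file name, is_sharded), in priority order
_CANDIDATES = [
    (SAFE_WEIGHTS_NAME, False),       # model.safetensors
    (SAFE_WEIGHTS_INDEX_NAME, True),  # model.safetensors.index.json
    (WEIGHTS_NAME, False),            # pytorch_model.bin
    (WEIGHTS_INDEX_NAME, True),       # pytorch_model.bin.index.json
]


def resolve_archive(file_explorer, subfolder, variant):
    """Return (archive, is_sharded) for the first weight file found, or (None, False)."""
    for name, is_sharded in _CANDIDATES:
        candidate = file_explorer.navigate_to(subfolder, _add_variant(name, variant))
        if candidate.is_file():
            return candidate, is_sharded
    return None, False
```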

src/transformers/models/auto/tokenization_auto.py (8 changes: 4 additions & 4 deletions)

@@ -28,7 +28,6 @@
 from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
 from ...utils import (
     cached_file,
-    extract_commit_hash,
     is_g2p_en_available,
     is_sentencepiece_available,
     is_tokenizers_available,
@@ -706,10 +705,11 @@ def get_tokenizer_config(
     if resolved_config_file is None:
         logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
         return {}
-    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
+    # TODO: handle this correctly
+    # commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
+    commit_hash = None
 
-    with open(resolved_config_file, encoding="utf-8") as reader:
-        result = json.load(reader)
+    result = json.loads(resolved_config_file.read_text())
     result["_commit_hash"] = commit_hash
     return result
 
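Two things happen in this hunk: the commit-hash extraction is stubbed out with a TODO (a resolved `FileExplorer` is no longer necessarily a cache path whose name encodes the commit hash), and `open()` plus `json.load` collapses into a single `read_text()` call. The latter works because a local file explorer and a `DDUFEntry` both expose `read_text()`; a sketch of that shared path, with the interface assumed from the diff:

```python
# Sketch: one parsing path regardless of where the config lives. The
# resolved object may be a filesystem explorer or a DDUFEntry; both are
# assumed to expose read_text(), as in the diff above.
import json


def parse_tokenizer_config(resolved_config_file) -> dict:
    # Works for a local file and for an entry inside a .dduf archive.
    return json.loads(resolved_config_file.read_text())
```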

src/transformers/models/clip/tokenization_clip.py (17 changes: 4 additions & 13 deletions)

@@ -13,14 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for CLIP."""
-
 import json
 import os
 import unicodedata
 from functools import lru_cache
 from typing import List, Optional, Tuple
 
 import regex as re
+from huggingface_hub import get_file_explorer
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...utils import logging
@@ -291,8 +291,6 @@ def __init__(
         pad_token="<|endoftext|>",  # hack to enable padding
         **kwargs,
     ):
-        dduf_entries = kwargs.get("dduf_entries", None)
-
         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
@@ -304,20 +302,13 @@
             logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
             self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
             self.fix_text = None
-        if dduf_entries:
-            self.encoder = json.loads(dduf_entries[vocab_file].read_text())
-        else:
-            with open(vocab_file, encoding="utf-8") as vocab_handle:
-                self.encoder = json.load(vocab_handle)
+
+        self.encoder = json.loads(get_file_explorer(vocab_file).read_text(encoding="utf-8"))
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        if dduf_entries:
-            bpe_merges = dduf_entries[merges_file].read_text().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
-        else:
-            with open(merges_file, encoding="utf-8") as merges_handle:
-                bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
+        bpe_merges = get_file_explorer(merges_file).read_text().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
 
         bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
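With both `dduf_entries` branches gone, vocab and merges loading in the CLIP tokenizer reduces to one code path. A self-contained sketch of that path, assuming the proposed `get_file_explorer` API; the merge-table slice mirrors the original code:

```python
# Sketch of the unified loading path used by __init__ above.
import json

from huggingface_hub import get_file_explorer  # proposed API, see diff


def load_clip_vocab(vocab_file: str, merges_file: str):
    # Same call works whether the files sit on disk or inside a .dduf archive.
    encoder = json.loads(get_file_explorer(vocab_file).read_text(encoding="utf-8"))
    merges = get_file_explorer(merges_file).read_text().strip().split("\n")
    # Line 0 is a version header; CLIP keeps 49152 - 256 - 2 merge rules.
    bpe_merges = [tuple(m.split()) for m in merges[1 : 49152 - 256 - 2 + 1]]
    bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    return encoder, bpe_ranks
```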