2 changes: 2 additions & 0 deletions CHANGELOGS.rst
@@ -4,6 +4,8 @@ Change Logs
0.7.1
+++++

* :pr:`159`: adds support for models with custom code on HuggingFace
* :pr:`158`: fixes uses of the pretrained version
* :pr:`156`, :pr:`157`: adds plots and other options to deal with the unpredictable
* :pr:`155`: better aggregation of historical data
* :pr:`151`, :pr:`153`: adds command line ``agg`` and class ``CubeLogsPerformance`` to produce timeseries
26 changes: 26 additions & 0 deletions _unittests/ut_torch_models/test_hghub_api.py
@@ -1,5 +1,7 @@
import os
import unittest
import pandas
import transformers
from onnx_diagnostic.ext_test_case import (
ExtTestCase,
hide_stdout,
@@ -13,6 +15,7 @@
enumerate_model_list,
get_model_info,
get_pretrained_config,
download_code_modelid,
task_from_id,
task_from_arch,
task_from_tags,
@@ -147,6 +150,29 @@ def test__ccached_config_64(self):
conf = _ccached_hf_internal_testing_tiny_random_beitforimageclassification()
self.assertEqual(conf.auxiliary_channels, 256)

@requires_transformers("4.50")
@requires_torch("2.7")
@ignore_errors(OSError) # connectivity issues
@hide_stdout()
def test_download_code_modelid(self):
model_id = "microsoft/Phi-3.5-MoE-instruct"
files = download_code_modelid(model_id, verbose=1, add_path_to_sys_path=True)
self.assertTrue(all(os.path.exists(f) for f in files))
pyf = [os.path.split(name)[-1] for name in files]
self.assertEqual(
["configuration_phimoe.py", "modeling_phimoe.py", "sample_finetune.py"], pyf
)
try:
cls = transformers.dynamic_module_utils.get_class_from_dynamic_module(
"modeling_phimoe.PhiMoERotaryEmbedding",
pretrained_model_name_or_path=os.path.split(files[0])[0],
)
except ImportError as e:
if "flash_attn" in str(e):
raise unittest.SkipTest(f"missing package {e}")
raise
self.assertEqual(cls.__name__, "PhiMoERotaryEmbedding")


if __name__ == "__main__":
unittest.main(verbosity=2)
43 changes: 42 additions & 1 deletion onnx_diagnostic/torch_models/hghub/hub_api.py
@@ -3,9 +3,10 @@
import json
import os
import pprint
import sys
from typing import Any, Dict, List, Optional, Union
import transformers
from huggingface_hub import HfApi, model_info, hf_hub_download
from huggingface_hub import HfApi, model_info, hf_hub_download, list_repo_files
from ...helpers.config_helper import update_config
from . import hub_data_cached_configs
from .hub_data import __date__, __data_tasks__, load_architecture_task, __data_arch_values__
@@ -327,3 +328,43 @@ def enumerate_model_list(
n -= 1
if n == 0:
break


def download_code_modelid(
model_id: str, verbose: int = 0, add_path_to_sys_path: bool = True
) -> List[str]:
"""
Downloads the Python code files of a given model id from the HuggingFace Hub.

:param model_id: model id
:param verbose: verbosity
:param add_path_to_sys_path: adds the folder where the files are downloaded to ``sys.path``
:return: list of downloaded files
"""
if verbose:
print(f"[download_code_modelid] retrieve file list for {model_id!r}")
files = list_repo_files(model_id)
pyfiles = [name for name in files if os.path.splitext(name)[-1] == ".py"]
if verbose:
print(f"[download_code_modelid] python files {pyfiles}")
absfiles = []
paths = set()
for i, name in enumerate(pyfiles):
if verbose:
print(f"[download_code_modelid] download file {i+1}/{len(pyfiles)}: {name!r}")
r = hf_hub_download(repo_id=model_id, filename=name)
p = os.path.split(r)[0]
paths.add(p)
absfiles.append(r)
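# When requested, an empty __init__.py is created in each download folder (if missing)
# and the folder is inserted at the front of sys.path so the downloaded modules
# become importable (rationale assumed from the docstring, not stated in this diff).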
if add_path_to_sys_path:
for p in paths:
init = os.path.join(p, "__init__.py")
if not os.path.exists(init):
with open(init, "w"):
pass
if p in sys.path:
continue
if verbose:
print(f"[download_code_modelid] add {p!r} to 'sys.path'")
sys.path.insert(0, p)
return absfiles
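For reference, a minimal usage sketch of the new helper, mirroring the unit test above (the model id and the class name are the ones exercised in that test; the import path follows the file location in this PR):

import os
import transformers
from onnx_diagnostic.torch_models.hghub.hub_api import download_code_modelid

# Download the repository's *.py files; add_path_to_sys_path makes them importable.
files = download_code_modelid(
    "microsoft/Phi-3.5-MoE-instruct", verbose=1, add_path_to_sys_path=True
)
folder = os.path.split(files[0])[0]
# Resolve the custom class through transformers' dynamic module utilities,
# exactly as the unit test above does.
cls = transformers.dynamic_module_utils.get_class_from_dynamic_module(
    "modeling_phimoe.PhiMoERotaryEmbedding", pretrained_model_name_or_path=folder
)
print(cls.__name__)  # PhiMoERotaryEmbedding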
38 changes: 36 additions & 2 deletions onnx_diagnostic/torch_models/hghub/model_inputs.py
@@ -6,7 +6,7 @@
import transformers
from ...helpers.config_helper import update_config
from ...tasks import reduce_model_config, random_input_kwargs
from .hub_api import task_from_arch, task_from_id, get_pretrained_config
from .hub_api import task_from_arch, task_from_id, get_pretrained_config, download_code_modelid


def _code_needing_rewriting(model: Any) -> Any:
@@ -149,7 +149,41 @@ def get_untrained_model_with_inputs(
model = transformers.AutoModel.from_pretrained(model_id, **mkwargs)
else:
if archs is not None:
model = getattr(transformers, archs[0])(config)
try:
model = getattr(transformers, archs[0])(config)
except AttributeError as e:
# The code of the model is not part of transformers but lives in the
# model's repository. It needs to be downloaded first.
pyfiles = download_code_modelid(model_id, verbose=verbose)
if pyfiles:
if "." in archs[0]:
cls_name = archs[0]
else:
modeling = [_ for _ in pyfiles if "/modeling_" in _]
assert len(modeling) == 1, (
f"Unable to guess the main file implemented class {archs[0]!r} "
f"from {pyfiles}, found={modeling}."
)
last_name = os.path.splitext(os.path.split(modeling[0])[-1])[0]
cls_name = f"{last_name}.{archs[0]}"
if verbose:
print(
f"[get_untrained_model_with_inputs] custom code for {cls_name!r}"
)
print(
f"[get_untrained_model_with_inputs] from folder "
f"{os.path.split(pyfiles[0])[0]!r}"
)
cls = transformers.dynamic_module_utils.get_class_from_dynamic_module(
cls_name, pretrained_model_name_or_path=os.path.split(pyfiles[0])[0]
)
model = cls(config)
else:
raise AttributeError(
f"Unable to find class 'tranformers.{archs[0]}'. "
f"The code needs to be downloaded, config="
f"\n{pprint.pformat(config)}."
) from e
else:
assert same_as_pretrained and use_pretrained, (
f"Model {model_id!r} cannot be built, the model cannot be built. "
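With the fallback above, a model whose code only lives in its repository can be built from its configuration. A hedged sketch of how the entry point might be called (the model id comes from the tests in this PR; the keyword arguments and the structure of the returned value are assumptions, not taken from this diff):

from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs

# Illustrative call only: exact arguments and return structure are assumptions.
data = get_untrained_model_with_inputs("microsoft/Phi-3.5-MoE-instruct", verbose=1)
model = data["model"]  # assumption: the untrained model is returned under this key
print(type(model).__name__)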