diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
index ac81fbb7..8deabab6 100644
--- a/CHANGELOGS.rst
+++ b/CHANGELOGS.rst
@@ -4,6 +4,8 @@ Change Logs
 0.7.1
 +++++
 
+* :pr:`159`: support for models with custom code in huggingface
+* :pr:`158`: fix uses of pretrained version
 * :pr:`156`, :pr:`157`: add plots and other options to deal with the unpredictable
 * :pr:`155`: better aggregation of historical data
 * :pr:`151`, :pr:`153`: adds command line ``agg``, class CubeLogsPerformance to produce timeseries
diff --git a/_unittests/ut_torch_models/test_hghub_api.py b/_unittests/ut_torch_models/test_hghub_api.py
index 3766560b..10a9689e 100644
--- a/_unittests/ut_torch_models/test_hghub_api.py
+++ b/_unittests/ut_torch_models/test_hghub_api.py
@@ -1,5 +1,7 @@
+import os
 import unittest
 import pandas
+import transformers
 from onnx_diagnostic.ext_test_case import (
     ExtTestCase,
     hide_stdout,
@@ -13,6 +15,7 @@
     enumerate_model_list,
     get_model_info,
     get_pretrained_config,
+    download_code_modelid,
     task_from_id,
     task_from_arch,
     task_from_tags,
@@ -147,6 +150,29 @@ def test__ccached_config_64(self):
         conf = _ccached_hf_internal_testing_tiny_random_beitforimageclassification()
         self.assertEqual(conf.auxiliary_channels, 256)
 
+    @requires_transformers("4.50")
+    @requires_torch("2.7")
+    @ignore_errors(OSError)  # connectivity issues
+    @hide_stdout()
+    def test_download_code_modelid(self):
+        model_id = "microsoft/Phi-3.5-MoE-instruct"
+        files = download_code_modelid(model_id, verbose=1, add_path_to_sys_path=True)
+        self.assertTrue(all(os.path.exists(f) for f in files))
+        pyf = [os.path.split(name)[-1] for name in files]
+        self.assertEqual(
+            ["configuration_phimoe.py", "modeling_phimoe.py", "sample_finetune.py"], pyf
+        )
+        try:
+            cls = transformers.dynamic_module_utils.get_class_from_dynamic_module(
+                "modeling_phimoe.PhiMoERotaryEmbedding",
+                pretrained_model_name_or_path=os.path.split(files[0])[0],
+            )
+        except ImportError as e:
+            if "flash_attn" in str(e):
+                raise unittest.SkipTest(f"missing package {e}")
+            raise
+        self.assertEqual(cls.__name__, "PhiMoERotaryEmbedding")
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/onnx_diagnostic/torch_models/hghub/hub_api.py b/onnx_diagnostic/torch_models/hghub/hub_api.py
index b78b093f..4d82ea92 100644
--- a/onnx_diagnostic/torch_models/hghub/hub_api.py
+++ b/onnx_diagnostic/torch_models/hghub/hub_api.py
@@ -3,9 +3,10 @@
 import json
 import os
 import pprint
+import sys
 from typing import Any, Dict, List, Optional, Union
 import transformers
-from huggingface_hub import HfApi, model_info, hf_hub_download
+from huggingface_hub import HfApi, model_info, hf_hub_download, list_repo_files
 from ...helpers.config_helper import update_config
 from . import hub_data_cached_configs
 from .hub_data import __date__, __data_tasks__, load_architecture_task, __data_arch_values__
@@ -327,3 +328,43 @@ def enumerate_model_list(
         n -= 1
         if n == 0:
             break
+
+
+def download_code_modelid(
+    model_id: str, verbose: int = 0, add_path_to_sys_path: bool = True
+) -> List[str]:
+    """
+    Downloads the code for a given model id.
+
+    :param model_id: model id
+    :param verbose: verbosity
+    :param add_path_to_sys_path: add folder where the files are downloaded to sys.path
+    :return: list of downloaded files
+    """
+    if verbose:
+        print(f"[download_code_modelid] retrieve file list for {model_id!r}")
+    files = list_repo_files(model_id)
+    pyfiles = [name for name in files if os.path.splitext(name)[-1] == ".py"]
+    if verbose:
+        print(f"[download_code_modelid] python files {pyfiles}")
+    absfiles = []
+    paths = set()
+    for i, name in enumerate(pyfiles):
+        if verbose:
+            print(f"[download_code_modelid] download file {i+1}/{len(pyfiles)}: {name!r}")
+        r = hf_hub_download(repo_id=model_id, filename=name)
+        p = os.path.split(r)[0]
+        paths.add(p)
+        absfiles.append(r)
+    if add_path_to_sys_path:
+        for p in paths:
+            init = os.path.join(p, "__init__.py")
+            if not os.path.exists(init):
+                with open(init, "w"):
+                    pass
+            if p in sys.path:
+                continue
+            if verbose:
+                print(f"[download_code_modelid] add {p!r} to 'sys.path'")
+            sys.path.insert(0, p)
+    return absfiles
diff --git a/onnx_diagnostic/torch_models/hghub/model_inputs.py b/onnx_diagnostic/torch_models/hghub/model_inputs.py
index 30448fda..1f8d89ed 100644
--- a/onnx_diagnostic/torch_models/hghub/model_inputs.py
+++ b/onnx_diagnostic/torch_models/hghub/model_inputs.py
@@ -6,7 +6,7 @@
 import transformers
 from ...helpers.config_helper import update_config
 from ...tasks import reduce_model_config, random_input_kwargs
-from .hub_api import task_from_arch, task_from_id, get_pretrained_config
+from .hub_api import task_from_arch, task_from_id, get_pretrained_config, download_code_modelid
 
 
 def _code_needing_rewriting(model: Any) -> Any:
@@ -149,7 +149,43 @@ def get_untrained_model_with_inputs(
         model = transformers.AutoModel.from_pretrained(model_id, **mkwargs)
     else:
         if archs is not None:
-            model = getattr(transformers, archs[0])(config)
+            try:
+                model = getattr(transformers, archs[0])(config)
+            except AttributeError as e:
+                # The code of the models is not in transformers but in the
+                # repository of the model. We need to download it.
+                pyfiles = download_code_modelid(model_id, verbose=verbose)
+                if pyfiles:
+                    if "." in archs[0]:
+                        cls_name = archs[0]
+                    else:
+                        modeling = [
+                            f for f in pyfiles if os.path.basename(f).startswith("modeling_")
+                        ]
+                        assert len(modeling) == 1, (
+                            f"Unable to guess the main file implemented class {archs[0]!r} "
+                            f"from {pyfiles}, found={modeling}."
+                        )
+                        last_name = os.path.splitext(os.path.split(modeling[0])[-1])[0]
+                        cls_name = f"{last_name}.{archs[0]}"
+                    if verbose:
+                        print(
+                            f"[get_untrained_model_with_inputs] custom code for {cls_name!r}"
+                        )
+                        print(
+                            f"[get_untrained_model_with_inputs] from folder "
+                            f"{os.path.split(pyfiles[0])[0]!r}"
+                        )
+                    cls = transformers.dynamic_module_utils.get_class_from_dynamic_module(
+                        cls_name, pretrained_model_name_or_path=os.path.split(pyfiles[0])[0]
+                    )
+                    model = cls(config)
+                else:
+                    raise AttributeError(
+                        f"Unable to find class 'transformers.{archs[0]}'. "
+                        f"The code needs to be downloaded, config="
+                        f"\n{pprint.pformat(config)}."
+                    ) from e
         else:
             assert same_as_pretrained and use_pretrained, (
                 f"Model {model_id!r} cannot be built, the model cannot be built. "