Commit 470af8a

Fix import error for Huggingface-hub version >=0.26.0 & Update Notebooks (#750)
This PR fixes an import error that caused Adapters to stop working with huggingface_hub version >= 0.26.0. The error stems from Hugging Face removing deprecated functions. Additionally, I updated all our example notebooks, which were outdated and still used the `use_auth_token` parameter that Hugging Face no longer accepts.

## Context for Why This Error Occurred

To download files from repositories, we have the `download_cached(...)` and `get_from_cache(...)` functions in src/adapters/utils.py. Hugging Face used similar methods in the past; our `get_from_cache` is copied from them. Hugging Face now uses a new method, `hf_hub_download(...)` (huggingface_hub/file_download.py).

However, we can't switch to `hf_hub_download(...)` because it carries the `validate_hf_hub_args` decorator, which checks that the arguments are valid, including that the target is a Hugging Face repo. Our function also needs to download files from GitHub, since the mapping from the old AdapterHub adapter paths to the new Hugging Face Hub paths lives in files hosted on GitHub. Therefore, we cannot use the new method and keep our own copies of these functions instead.
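For illustration only, here is a minimal sketch of the kind of URL-agnostic cached download this requires. It is not the actual `download_cached(...)` implementation in src/adapters/utils.py (which also handles things like ETags and redirects); the requests-based approach and the cache layout below are assumptions made for this example:

```python
# Hypothetical sketch of a URL-agnostic cached download helper; the real
# download_cached(...) in src/adapters/utils.py is more involved.
import hashlib
import os

import requests


def download_cached(url: str, cache_dir: str = "~/.cache/adapters") -> str:
    """Download any URL (Hub file or GitHub mapping file) once and return the cached path."""
    cache_dir = os.path.expanduser(cache_dir)
    os.makedirs(cache_dir, exist_ok=True)
    # Key the cache entry by a hash of the URL so repeated calls are no-ops.
    cache_path = os.path.join(cache_dir, hashlib.sha256(url.encode("utf-8")).hexdigest())
    if not os.path.isfile(cache_path):
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(cache_path, "wb") as f:
            f.write(response.content)
    return cache_path
```

By contrast, `hf_hub_download(repo_id=..., filename=...)` addresses files by repo id rather than by URL, and its argument validation rejects targets that don't name a Hub repository, which rules it out for the GitHub-hosted mapping files.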
1 parent 6fefc9a commit 470af8a

File tree

11 files changed: +814, -327 lines

examples/pytorch/language-modeling/run_clm.py

Lines changed: 114 additions & 54 deletions
Large diffs are not rendered by default.

examples/pytorch/language-modeling/run_mlm.py

Lines changed: 136 additions & 57 deletions
Large diffs are not rendered by default.

examples/pytorch/multiple-choice/run_swag.py

Lines changed: 10 additions & 7 deletions
@@ -45,11 +45,11 @@
 )
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import PaddingStrategy, check_min_version
+from transformers.utils import PaddingStrategy, check_min_version, send_example_telemetry
 
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.26.0")
+check_min_version("4.44.0")
 
 logger = logging.getLogger(__name__)
 
@@ -94,9 +94,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
-                " should only be set to `True` for repositories you trust and in which you have read the code, as it"
-                " will execute code present on the Hub on your local machine."
+                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                "execute code present on the Hub on your local machine."
             )
         },
     )
@@ -239,6 +239,10 @@ def main():
     else:
         model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()
 
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    send_example_telemetry("run_swag", model_args, data_args)
+
     # Setup logging
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -260,8 +264,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
-        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training:"
-        f" {training_args.fp16}"
+        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")
examples/pytorch/question-answering/run_qa.py

Lines changed: 59 additions & 21 deletions
@@ -21,6 +21,7 @@
 import logging
 import os
 import sys
+import warnings
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -45,13 +46,13 @@
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 from utils_qa import postprocess_qa_predictions
 
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.26.0")
+check_min_version("4.44.0")
 
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
 
@@ -81,12 +82,22 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    use_auth_token: bool = field(
+    token: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
+    trust_remote_code: bool = field(
         default=False,
         metadata={
             "help": (
-                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-                "with private models)."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -231,13 +242,21 @@ def main():
     else:
         model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()
 
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    send_example_telemetry("run_qa", model_args, data_args)
+
     # Setup logging
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 
+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+
     log_level = training_args.get_process_log_level()
     logger.setLevel(log_level)
     datasets.utils.logging.set_verbosity(log_level)
@@ -247,8 +266,8 @@ def main():
 
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
     logger.info(f"Training/evaluation parameters {training_args}")
 
@@ -285,7 +304,8 @@ def main():
             data_args.dataset_name,
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
@@ -304,10 +324,10 @@ def main():
             data_files=data_files,
             field="data",
             cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
@@ -318,25 +338,27 @@ def main():
         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=True,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     model = AutoModelForQuestionAnswering.from_pretrained(
         model_args.model_name_or_path,
         from_tf=bool(".ckpt" in model_args.model_name_or_path),
         config=config,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
 
-    # Convert the model into an adapter model
     adapters.init(model)
 
     # Tokenizer check: this script requires a fast tokenizer.
@@ -348,7 +370,7 @@ def main():
     )
 
     # Preprocessing the datasets.
-    # Preprocessing is slighlty different for training and evaluation.
+    # Preprocessing is slightly different for training and evaluation.
     if training_args.do_train:
         column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
@@ -364,7 +386,7 @@ def main():
 
     if data_args.max_seq_length > tokenizer.model_max_length:
         logger.warning(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
         )
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
@@ -404,7 +426,12 @@ def prepare_train_features(examples):
         for i, offsets in enumerate(offset_mapping):
             # We will label impossible answers with the index of the CLS token.
             input_ids = tokenized_examples["input_ids"][i]
-            cls_index = input_ids.index(tokenizer.cls_token_id)
+            if tokenizer.cls_token_id in input_ids:
+                cls_index = input_ids.index(tokenizer.cls_token_id)
+            elif tokenizer.bos_token_id in input_ids:
+                cls_index = input_ids.index(tokenizer.bos_token_id)
+            else:
+                cls_index = 0
 
             # Grab the sequence corresponding to that example (to know what is the context and what is the question).
             sequence_ids = tokenized_examples.sequence_ids(i)
@@ -589,21 +616,32 @@ def post_processing_function(examples, features, predictions, stage="eval"):
         # Format the result to the format the metric expects.
         if data_args.version_2_with_negative:
             formatted_predictions = [
-                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
+                {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
             ]
         else:
-            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
 
+            formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()]
 
-        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
+        references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples]
         return EvalPrediction(predictions=formatted_predictions, label_ids=references)
 
-    metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")
+    if data_args.version_2_with_negative:
+        accepted_best_metrics = ("exact", "f1", "HasAns_exact", "HasAns_f1")
+    else:
+        accepted_best_metrics = ("exact_match", "f1")
+
+    if training_args.load_best_model_at_end and training_args.metric_for_best_model not in accepted_best_metrics:
+        warnings.warn(f"--metric_for_best_model should be set to one of {accepted_best_metrics}")
+
+    metric = evaluate.load(
+        "squad_v2" if data_args.version_2_with_negative else "squad", cache_dir=model_args.cache_dir
+    )
 
     def compute_metrics(p: EvalPrediction):
         return metric.compute(predictions=p.predictions, references=p.label_ids)
 
     # Setup adapters
     setup_adapter_training(model, adapter_args, data_args.dataset_name or "squad")
+
     # Initialize our Trainer
     trainer_class = QuestionAnsweringAdapterTrainer if adapter_args.train_adapter else QuestionAnsweringTrainer
     trainer = trainer_class(
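Across run_qa.py and the other example scripts above, the recurring migration is from the removed `use_auth_token` flag to the `token` argument, plus an explicit `trust_remote_code` opt-in. A minimal sketch of the new calling convention; the model id below is just a small public checkpoint chosen for illustration, and its QA head will be freshly initialized:

```python
from transformers import AutoModelForQuestionAnswering

# Old style, no longer accepted by recent Transformers releases:
#   AutoModelForQuestionAnswering.from_pretrained(model_id, use_auth_token=True)

# New style: pass the token directly; None falls back to the credentials
# stored by `huggingface-cli login`. Leave trust_remote_code False unless
# you have read and trust the repository's custom modeling code.
model = AutoModelForQuestionAnswering.from_pretrained(
    "distilbert-base-uncased",  # illustrative model id
    token=None,
    trust_remote_code=False,
)
```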