@@ -21,6 +21,7 @@
 import logging
 import os
 import sys
+import warnings
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -45,13 +46,13 @@
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 from utils_qa import postprocess_qa_predictions
 
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.26.0")
+check_min_version("4.44.0")
 
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
 
@@ -81,12 +82,22 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    use_auth_token: bool = field(
+    token: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
+    trust_remote_code: bool = field(
         default=False,
         metadata={
             "help": (
-                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-                "with private models)."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -231,13 +242,21 @@ def main():
     else:
         model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()
 
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    send_example_telemetry("run_qa", model_args, data_args)
+
     # Setup logging
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 
+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+
     log_level = training_args.get_process_log_level()
     logger.setLevel(log_level)
     datasets.utils.logging.set_verbosity(log_level)
@@ -247,8 +266,8 @@ def main():
 
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")
 
@@ -285,7 +304,8 @@ def main():
             data_args.dataset_name,
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
@@ -304,10 +324,10 @@ def main():
             data_files=data_files,
             field="data",
             cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
@@ -318,25 +338,27 @@ def main():
         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=True,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     model = AutoModelForQuestionAnswering.from_pretrained(
         model_args.model_name_or_path,
         from_tf=bool(".ckpt" in model_args.model_name_or_path),
         config=config,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
 
-    # Convert the model into an adapter model
     adapters.init(model)
 
     # Tokenizer check: this script requires a fast tokenizer.
@@ -348,7 +370,7 @@ def main():
         )
 
     # Preprocessing the datasets.
-    # Preprocessing is slighlty different for training and evaluation.
+    # Preprocessing is slightly different for training and evaluation.
     if training_args.do_train:
         column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
@@ -364,7 +386,7 @@ def main():
 
     if data_args.max_seq_length > tokenizer.model_max_length:
         logger.warning(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
         )
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
@@ -404,7 +426,12 @@ def prepare_train_features(examples):
         for i, offsets in enumerate(offset_mapping):
             # We will label impossible answers with the index of the CLS token.
             input_ids = tokenized_examples["input_ids"][i]
-            cls_index = input_ids.index(tokenizer.cls_token_id)
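+            # Not every tokenizer defines a CLS token (e.g. GPT-style tokenizers only provide a
+            # BOS token), so fall back to the BOS token and, failing that, to position 0.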
+            if tokenizer.cls_token_id in input_ids:
+                cls_index = input_ids.index(tokenizer.cls_token_id)
+            elif tokenizer.bos_token_id in input_ids:
+                cls_index = input_ids.index(tokenizer.bos_token_id)
+            else:
+                cls_index = 0
 
             # Grab the sequence corresponding to that example (to know what is the context and what is the question).
             sequence_ids = tokenized_examples.sequence_ids(i)
@@ -589,21 +616,32 @@ def post_processing_function(examples, features, predictions, stage="eval"):
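+        # Example ids may be ints in some datasets; the squad/squad_v2 metric matches predictions
+        # to references by id, so both sides are cast to str below.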
         # Format the result to the format the metric expects.
         if data_args.version_2_with_negative:
             formatted_predictions = [
-                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
+                {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
             ]
         else:
-            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
+            formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()]
 
-        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
+        references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples]
         return EvalPrediction(predictions=formatted_predictions, label_ids=references)
 
-    metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")
+    if data_args.version_2_with_negative:
+        accepted_best_metrics = ("exact", "f1", "HasAns_exact", "HasAns_f1")
+    else:
+        accepted_best_metrics = ("exact_match", "f1")
+
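+    # `--load_best_model_at_end` selects checkpoints by `metric_for_best_model`, so warn early if
+    # the chosen key is not one the squad/squad_v2 metric will actually report.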
+    if training_args.load_best_model_at_end and training_args.metric_for_best_model not in accepted_best_metrics:
+        warnings.warn(f"--metric_for_best_model should be set to one of {accepted_best_metrics}")
+
+    metric = evaluate.load(
+        "squad_v2" if data_args.version_2_with_negative else "squad", cache_dir=model_args.cache_dir
+    )
 
     def compute_metrics(p: EvalPrediction):
         return metric.compute(predictions=p.predictions, references=p.label_ids)
 
     # Setup adapters
     setup_adapter_training(model, adapter_args, data_args.dataset_name or "squad")
+
     # Initialize our Trainer
     trainer_class = QuestionAnsweringAdapterTrainer if adapter_args.train_adapter else QuestionAnsweringTrainer
     trainer = trainer_class(