Skip to content

Commit 158ff7d

Browse files
Copilot and thinkall authored
Fix transformers API compatibility: support v4.26+ and v5.0+ with version-aware parameter selection (#1514)
* Initial plan * Fix transformers API compatibility issues Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add backward compatibility for transformers v4.26+ by version check Support both tokenizer (v4.26-4.43) and processing_class (v4.44+) parameters based on installed transformers version. Fallback to tokenizer if version check fails. Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Improve exception handling specificity Use specific exception types (ImportError, AttributeError, ValueError) instead of broad Exception catch for better error handling. Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Run pre-commit formatting on all files Applied black formatting to fix code style across the repository. Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
1 parent a502115 commit 158ff7d

File tree

2 files changed

+43
-29
lines changed

2 files changed

+43
-29
lines changed

flaml/automl/model.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,16 +1196,31 @@ def on_epoch_end(self, args, state, control, **callback_kwargs):
11961196
control.should_save = True
11971197
control.should_evaluate = True
11981198

1199-
self._trainer = TrainerForAuto(
1200-
args=self._training_args,
1201-
model_init=self._model_init,
1202-
train_dataset=train_dataset,
1203-
eval_dataset=eval_dataset,
1204-
tokenizer=self.tokenizer,
1205-
data_collator=self.data_collator,
1206-
compute_metrics=self._compute_metrics_by_dataset_name,
1207-
callbacks=[EarlyStoppingCallbackForAuto],
1208-
)
1199+
# Use processing_class for transformers >= 4.44.0, tokenizer for older versions
1200+
trainer_kwargs = {
1201+
"args": self._training_args,
1202+
"model_init": self._model_init,
1203+
"train_dataset": train_dataset,
1204+
"eval_dataset": eval_dataset,
1205+
"data_collator": self.data_collator,
1206+
"compute_metrics": self._compute_metrics_by_dataset_name,
1207+
"callbacks": [EarlyStoppingCallbackForAuto],
1208+
}
1209+
1210+
# Check if processing_class parameter is supported (transformers >= 4.44.0)
1211+
try:
1212+
import transformers
1213+
from packaging import version
1214+
1215+
if version.parse(transformers.__version__) >= version.parse("4.44.0"):
1216+
trainer_kwargs["processing_class"] = self.tokenizer
1217+
else:
1218+
trainer_kwargs["tokenizer"] = self.tokenizer
1219+
except (ImportError, AttributeError, ValueError):
1220+
# Fallback to tokenizer if version check fails
1221+
trainer_kwargs["tokenizer"] = self.tokenizer
1222+
1223+
self._trainer = TrainerForAuto(**trainer_kwargs)
12091224

12101225
if self._task in NLG_TASKS:
12111226
setattr(self._trainer, "_is_seq2seq", True)

flaml/automl/nlp/huggingface/utils.py

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -211,29 +211,28 @@ def tokenize_onedataframe(
211211
hf_args=None,
212212
prefix_str=None,
213213
):
214-
with tokenizer.as_target_tokenizer():
215-
_, tokenized_column_names = tokenize_row(
216-
dict(X.iloc[0]),
214+
_, tokenized_column_names = tokenize_row(
215+
dict(X.iloc[0]),
216+
tokenizer,
217+
prefix=(prefix_str,) if task is SUMMARIZATION else None,
218+
task=task,
219+
hf_args=hf_args,
220+
return_column_name=True,
221+
)
222+
d = X.apply(
223+
lambda x: tokenize_row(
224+
x,
217225
tokenizer,
218226
prefix=(prefix_str,) if task is SUMMARIZATION else None,
219227
task=task,
220228
hf_args=hf_args,
221-
return_column_name=True,
222-
)
223-
d = X.apply(
224-
lambda x: tokenize_row(
225-
x,
226-
tokenizer,
227-
prefix=(prefix_str,) if task is SUMMARIZATION else None,
228-
task=task,
229-
hf_args=hf_args,
230-
),
231-
axis=1,
232-
result_type="expand",
233-
)
234-
X_tokenized = pd.DataFrame(columns=tokenized_column_names)
235-
X_tokenized[tokenized_column_names] = d
236-
return X_tokenized
229+
),
230+
axis=1,
231+
result_type="expand",
232+
)
233+
X_tokenized = pd.DataFrame(columns=tokenized_column_names)
234+
X_tokenized[tokenized_column_names] = d
235+
return X_tokenized
237236

238237

239238
def tokenize_row(

0 commit comments

Comments
 (0)