Skip to content

Commit 1a6b4e6

Browse files
committed
[compat] Introduce Transformers v5.2 compatibility: trainer _nested_gather moved (#3664)
* Introduce Transformers v5.2 compatibility: the trainer's `_nested_gather` moved. * Replace prajjwal1/bert-tiny due to issues loading it with AutoConfig. * Disable the transformers progress bars in the CI, since the Transformers v5 weight-loading progress bars heavily expand the logs.
1 parent f7f7506 commit 1a6b4e6

File tree

5 files changed

+23
-11
lines changed

5 files changed

+23
-11
lines changed

.github/workflows/tests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ on:
1313

1414
env:
1515
TRANSFORMERS_IS_CI: 1
16+
HF_HUB_DISABLE_PROGRESS_BARS: 1 # The Transformers v5 weight loading progress bars heavily expand the logs
1617

1718
jobs:
1819
test_sampling:

sentence_transformers/trainer.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,16 @@ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
475475
# If we don't copy the logs, we'll include the loss components in the on_evaluate as well,
476476
# whereas we prefer to have them only in the on_log
477477
logs = logs.copy()
478-
accum_losses = self._nested_gather(self.accum_loss_components[training_type])
478+
# Transformers v4/v5 compatibility: v5.2 moves _nested_gather to `transformers.trainer_pt_utils`,
479+
# see https://github.com/huggingface/transformers/pull/43744
480+
if hasattr(self, "_nested_gather"):
481+
accum_losses = self._nested_gather(self.accum_loss_components[training_type])
482+
else:
483+
from transformers.trainer_pt_utils import nested_gather
484+
485+
accum_losses = nested_gather(
486+
self.accum_loss_components[training_type], parallel_mode=self.args.parallel_mode
487+
)
479488
if "steps" in accum_losses:
480489
steps = accum_losses.get("steps").sum().item()
481490
self.accum_loss_components[training_type]["steps"] *= 0

tests/cross_encoder/test_cross_encoder.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ def test_target_device_backwards_compat():
258258

259259

260260
def test_num_labels_fresh_model():
261-
model = CrossEncoder("prajjwal1/bert-tiny")
261+
model = CrossEncoder("sentence-transformers-testing/stsb-bert-tiny-safetensors")
262262
assert model.num_labels == 1
263263

264264

@@ -542,7 +542,9 @@ def test_logger_warning(caplog):
542542
],
543543
)
544544
def test_load_activation_fn_from_kwargs(num_labels: int, activation_fn: str, saved_activation_fn: str, tmp_path: Path):
545-
model = CrossEncoder("prajjwal1/bert-tiny", num_labels=num_labels, activation_fn=activation_fn)
545+
model = CrossEncoder(
546+
"sentence-transformers-testing/stsb-bert-tiny-safetensors", num_labels=num_labels, activation_fn=activation_fn
547+
)
546548
assert fullname(model.activation_fn) == saved_activation_fn
547549

548550
model.save_pretrained(tmp_path)

tests/cross_encoder/test_model_card.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def dummy_dataset():
4646
"- sentence-transformers",
4747
"- cross-encoder",
4848
"pipeline_tag: text-ranking",
49-
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny)",
49+
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors)",
5050
"[sentence-transformers](https://www.SBERT.net) library",
5151
"It computes scores for pairs of texts, which can be used for text reranking and semantic search.",
5252
"**Maximum Sequence Length:** 512 tokens",
@@ -71,7 +71,7 @@ def dummy_dataset():
7171
"- sentence-transformers",
7272
"- cross-encoder",
7373
"pipeline_tag: text-classification",
74-
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny)",
74+
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors)",
7575
"[sentence-transformers](https://www.SBERT.net) library",
7676
"It computes scores for pairs of texts, which can be used for text pair classification.",
7777
"**Maximum Sequence Length:** 512 tokens",
@@ -91,15 +91,15 @@ def dummy_dataset():
9191
1,
9292
1,
9393
[
94-
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny) on the train_0 dataset using the [sentence-transformers](https://www.SBERT.net) library.",
94+
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0 dataset using the [sentence-transformers](https://www.SBERT.net) library.",
9595
"#### train_0",
9696
],
9797
),
9898
(
9999
2,
100100
1,
101101
[
102-
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny) on the train_0 and train_1 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
102+
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0 and train_1 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
103103
"#### train_0",
104104
"#### train_1",
105105
],
@@ -108,7 +108,7 @@ def dummy_dataset():
108108
10,
109109
1,
110110
[
111-
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
111+
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
112112
"<details><summary>train_0</summary>", # We start using <details><summary> if we have more than 3 datasets
113113
"#### train_0",
114114
"</details>\n<details><summary>train_9</summary>",
@@ -120,7 +120,7 @@ def dummy_dataset():
120120
50,
121121
1,
122122
[
123-
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny) on 50 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
123+
"This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) on 50 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
124124
"<details><summary>train_0</summary>",
125125
"#### train_0",
126126
"</details>\n<details><summary>train_49</summary>",
@@ -135,7 +135,7 @@ def test_model_card_base(
135135
num_labels: int,
136136
expected_substrings: list[str],
137137
) -> None:
138-
model = CrossEncoder("prajjwal1/bert-tiny", num_labels=num_labels)
138+
model = CrossEncoder("sentence-transformers-testing/stsb-bert-tiny-safetensors", num_labels=num_labels)
139139

140140
# Let's avoid requesting the Hub for e.g. checking if a base model exists there
141141
model.model_card_data.local_files_only = True

tests/sparse_encoder/test_sparse_encoder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ def test_detect_mlm():
472472
def test_default_to_csr():
473473
# NOTE: bert-tiny is actually MLM-based, but the config isn't modern enough to allow us to detect it,
474474
# so we should default to CSR here.
475-
model = SparseEncoder("prajjwal1/bert-tiny")
475+
model = SparseEncoder("sentence-transformers-testing/stsb-bert-tiny-safetensors")
476476
assert isinstance(model[0], Transformer)
477477
assert isinstance(model[1], Pooling)
478478
assert isinstance(model[2], SparseAutoEncoder)

0 commit comments

Comments (0)