179 commits
96c3fc0
added base model files to adapt and update.
burcgokden Aug 12, 2025
922bcf2
Replace `logger.warning` with `logger.warning_once` in `GradientCheck…
qgallouedec Aug 12, 2025
c997513
Fix regression in mllama vision encoder (#40083)
Isotr0py Aug 12, 2025
17168a5
Switch the order of args in StaticCache (for BC and future logic) (#4…
Cyrilvallez Aug 12, 2025
2ed9a8e
Fix Qwen3 MoE GGUF architecture mismatch (#39976)
ctcanbol Aug 12, 2025
05e5a95
Fix error on importing unavailable torch.distributed (#40038)
m-gallus Aug 12, 2025
d8b7254
Default to dequantize if cpu in device_map for mxfp4 (#39993)
MekkCyber Aug 12, 2025
c9254db
[`Flash Attention`] Fix flash attention integration (#40002)
vasqu Aug 12, 2025
4faefc6
[trainer] ensure special tokens in model configs are aligned with tok…
gante Aug 12, 2025
c4f39cd
Fix Causality Handling in Flash Attention to Support Bidirectional At…
lucaswychan Aug 12, 2025
ed3288a
[docs] Add reference to HF-maintained `custom_generate` collections (…
gante Aug 12, 2025
dc92a0a
Add model card for MobileViT (#40033)
Shivamjan Aug 12, 2025
2a5f7d8
remove sequence parallel in llama4 (#40084)
3outeille Aug 12, 2025
9504be9
🌐 [i18n-KO] Translated `tiny_agents.md` to Korean (#39913)
AhnJoonSung Aug 13, 2025
1a97c7f
[bugfix] Fix tensor device in Idefics2, Idefics3, and SmolVLM (#39975)
qgallouedec Aug 13, 2025
0a87fce
changed xLSTMRMSNorm to RMSNorm (#40113)
nikitazuevblago Aug 13, 2025
191f561
Fix QuantoQuantizedCache import issues (#40109)
manueldeprada Aug 13, 2025
93d2ceb
[serve] allow array `content` inputs for LLMs (#39829)
gante Aug 13, 2025
a734507
`decoding_method` argument in generate (#40085)
manueldeprada Aug 13, 2025
f28d396
Collated reports (#40080)
ivarflakstad Aug 13, 2025
8cb9dee
DOCS: Add missing space in SECURITY.md (#40087)
shivaheidari Aug 13, 2025
d41bada
[trainer] handle case where EOS token is None in `generation_config` …
gante Aug 13, 2025
bdb2946
Fix hidden torchvision>=0.15 dependency issue (#39928)
yonigozlan Aug 13, 2025
78b3efc
🌐 [i18n-KO] Translated `main_classes/processors.md` to Korean (#39519)
TaskerJang Aug 13, 2025
cbb6231
🌐 [i18n-KO] Translated `jamba.md` to Korean (#39890)
skwh54 Aug 13, 2025
6d0da2b
🌐 [i18n-KO] Translated `main_classes/optimizer_schedules.md` to Korea…
luckyvickyricky Aug 13, 2025
72c36a6
🚨🚨 [generate] ignore `cache_implementation="hybrid"` hub defaults (#…
gante Aug 13, 2025
3835371
🌐 [i18n-KO] Translated `gpt2.md` to Korean (#39808)
taemincode Aug 13, 2025
f4b3450
🌐 [i18n-KO] Translated `optimizers.md` to Korean (#40011)
chelsseeey Aug 13, 2025
71a93b3
🌐 [i18n-KO] Translated grounding-dino.md to Korean (#39861)
TaskerJang Aug 13, 2025
8c50091
🚨 Use lru_cache for sine pos embeddings MaskFormer (#40007)
yonigozlan Aug 13, 2025
155e883
🌐 [i18n-KO] Translated `pipelines.md` to Korean (#39577)
xhaktm00 Aug 13, 2025
e593082
gpt oss is important (#40139)
ArthurZucker Aug 13, 2025
c9f71fc
Fix Janus (#40140)
Cyrilvallez Aug 13, 2025
519c373
Add Segment Anything 2 (SAM2) (#32317)
SangbumChoi Aug 13, 2025
ece1357
[docs] Fix ko toctree (#40138)
stevhliu Aug 13, 2025
9d281bb
Remove an old badly designed test (#40142)
Cyrilvallez Aug 13, 2025
4dc335b
updated visualBERT modelcard (#40057)
Anil-Red Aug 13, 2025
faf91a3
🌐 [i18n-KO] Translated `gemma3.md` to Korean (#39865)
seopp Aug 13, 2025
9fb9a53
Fix quantized cache with only cache_implementation in generate (#40144)
Cyrilvallez Aug 13, 2025
bfb3649
Add pytest marker: `torch_compile_test` and `torch_export_test` (#39950)
ydshieh Aug 13, 2025
3ba5f34
Update Dockerfiles to install packages inside a virtual environment (…
Sai-Suraj-27 Aug 13, 2025
42e8f5d
Create self-scheduled-amd-mi355-caller.yml (#40134)
glegendre01 Aug 13, 2025
afb6ebd
[Cohere2Vision] remove unused arg (#40103)
zucchini-nlp Aug 14, 2025
23dbde4
[efficientloftr] fix bugs and follow original cross attn implementati…
sbucaille Aug 14, 2025
fec6cc0
Fix CI: Use correct import in SAM for torchvision InterpolationMode (…
manueldeprada Aug 14, 2025
3222e6a
[Continous Batching] set head_dim when config.head_dim is None (#40159)
kashif Aug 14, 2025
54579db
Replace `self.tokenizer` by `self.processing_class` (#40119)
qgallouedec Aug 14, 2025
f3a08f5
[FA2] Fix it finally - revert fa kwargs preparation (#40161)
Cyrilvallez Aug 14, 2025
d35b845
[bugfix] fix flash-attention2 unavailable error for Ascend NPU (#40151)
FightingZhen Aug 14, 2025
72e7cfc
Fix docs typo (#40167)
qubvel Aug 14, 2025
1360e9d
build: Add fast image processor tvp (#39529)
adutchengineer Aug 14, 2025
854ce07
Add GptOssForSequenceClassification for GPT-OSS models (#40043)
zyfedward Aug 14, 2025
c7c7d21
Standardize BARTpho model card: badges, new examples, fixed broken im…
eshwanthkartitr Aug 14, 2025
3849c13
Add dates to the model docs (#39320)
MHRDYN7 Aug 14, 2025
752f6e5
Pin torch to 2.7.1 on CircleCI for now (#40174)
ydshieh Aug 14, 2025
498f77d
Update dynamic attnt setter for multimodals (#39908)
zucchini-nlp Aug 14, 2025
19c878c
[MINOR:TYPO] Update base.py (#40169)
cakiki Aug 15, 2025
1885b6f
make model doc device agnostic (#40143)
yao-matrix Aug 15, 2025
5c88c77
fix to avoid modifying a view in place (#40162)
3outeille Aug 15, 2025
1945f00
Fix fsdp for generic-task models (#40191)
Cyrilvallez Aug 15, 2025
6b4637c
Add repr to EncoderDecoderCache (#40195)
Cyrilvallez Aug 15, 2025
f6f5ec9
Fix typos (#40175)
cyyever Aug 15, 2025
edbfd71
Remove _prepare_flash_attention_from_position_ids (#40069)
cyyever Aug 15, 2025
2c77338
Avoid CUDA stream sync (#40060)
cyyever Aug 15, 2025
18b34d3
Fix various Pylint warnings (#40107)
cyyever Aug 15, 2025
dbbd254
Update: add type hints to check_tokenizers.py (#40094)
ajeet214 Aug 15, 2025
76e2d3e
Benchmarking improvements (#39768)
ahadnagy Aug 15, 2025
f1f1320
Add X-Codec model (#38248)
Manalelaidouni Aug 15, 2025
a144fdf
Fix GPT-OSS `swiglu_limit` not passed in for MXFP4 (#40197)
danielhanchen Aug 15, 2025
26d603a
docs: Update LayoutLM model card according to new standardized format…
Jin-HoMLee Aug 15, 2025
fc92053
Revert "Pin torch to 2.7.1 on CircleCI for now" + Final fix for `too …
ydshieh Aug 18, 2025
0501790
Use correct `model_input_names` for PixtralImageProcessor (#40226)
rohitrango Aug 18, 2025
2e18a74
fix error vocab_size at Qwen2_5_VLForConditionalGeneration loss_funct…
killight98 Aug 18, 2025
8c7834e
[SAM 2] Change checkpoints in docs and tests (#40213)
yonigozlan Aug 18, 2025
0364e16
Fix more typos (#40212)
cyyever Aug 18, 2025
5dc0472
Fix ESM token_dropout crash when using inputs_embeds instead of input…
notkisk Aug 18, 2025
d9b0d7a
AMD scheduled CI ref env file (#40243)
ivarflakstad Aug 18, 2025
69617d3
Add Ovis2 model and processor implementation (#37088)
thisisiron Aug 18, 2025
4e4a30a
Fix more pylint warnings (#40204)
cyyever Aug 18, 2025
e36c8d0
🚨 Always return Cache objects in modelings (to align with generate) (…
manueldeprada Aug 18, 2025
1cc32a6
remove transpose_for_scores call in ESM-2 (#40210)
pstjohn Aug 18, 2025
c575c99
Add `chat_template` (`jinja2`) as an extra dependency (#40128)
tboerstad Aug 18, 2025
e04f058
[typing] fix type annotation error in DepthPro model image processor …
MengAiDev Aug 18, 2025
0ab4151
[serve] guard imports (#39825)
gante Aug 18, 2025
c5f1e44
[`CI`] Fix repo consistency (#40249)
vasqu Aug 18, 2025
6e33102
Fixes for EncoderDecoderCache (#40008)
remi-or Aug 18, 2025
3511e6a
fix: Catch correct ConnectionError for additional_chat_templates (#39…
akug Aug 18, 2025
d73780a
Model card for NLLB (#40074)
sahil-kabir Aug 18, 2025
585479b
Correct typo and update notes in docs Readme (#40234)
PavloFesenko Aug 18, 2025
ff734c8
Fix benchmark workflow (#40254)
ahadnagy Aug 18, 2025
b0d2c10
docs: Update OLMo model card (#40233)
rafakatri Aug 18, 2025
9824747
Skip broken tests (#40157)
zucchini-nlp Aug 19, 2025
d2ece7c
Remove MI300 CI (#40270)
ivarflakstad Aug 19, 2025
357aa63
set inputs_embeds to None while generate to avoid audio encoder forwa…
BakerBunker Aug 19, 2025
5f0337f
[detection] fix attention mask for RT-DETR-based models (#40269)
materight Aug 19, 2025
8a9d254
Fix slow static cache export tests (#40261)
jackzhxng Aug 19, 2025
88cb955
🚨🚨 Switch default compilation to fullgraph=False (#40137)
Cyrilvallez Aug 19, 2025
80874d8
Fix setting attention for multimodal models (#39984)
zucchini-nlp Aug 19, 2025
c020d0b
[detection] fix correct `k_proj` weight and bias slicing in D-FINE (#…
notkisk Aug 19, 2025
86c15dc
Add Kosmos-2.5 (#31711)
tic-top Aug 19, 2025
ea01821
Skipping pytree registration in case fsdp is enabled (#40075)
romitjain Aug 19, 2025
3917632
Update image_processing_perception_lm_fast.py to allow for proper ove…
tyleryzhu Aug 19, 2025
efbab42
fix which routing method (#40283)
ArthurZucker Aug 19, 2025
48828ff
Fix chat CLI GPU loading and request_id validation issues (#40230) (#…
robin-ede Aug 19, 2025
0346cae
docs(layoutlm): add missing `id=usage` to `<hfoptions>` tag in Layout…
Jin-HoMLee Aug 19, 2025
75ef97a
Standardize RAG model card (#40222)
aayush226 Aug 19, 2025
7dbdf2f
docs: Update TrOCR model card to new format (#40240)
AceHunterr Aug 19, 2025
ab035fe
Update model card for gpt neox japanese (#39862)
ahnjj Aug 19, 2025
3c45aa3
SmolVLM and InternVL: Ensure pixel values are converted to the correc…
qgallouedec Aug 19, 2025
00387c5
Standardize BertGeneration model card (#40250)
nemitha2005 Aug 19, 2025
ef49bb4
Adjust ROCm test output expectations (#40279)
ahadnagy Aug 19, 2025
7ca285d
SmolVLM test fixes (#40275)
ahadnagy Aug 19, 2025
4f1cc77
make model docs device agnostic (2) (#40256)
yao-matrix Aug 19, 2025
8bd0c3e
[3/3] make docs device agnostic, all en docs for existing models done…
yao-matrix Aug 20, 2025
3313a92
Add MetaCLIP 2 (#39826)
NielsRogge Aug 20, 2025
9783ece
Allow to be able to run `torch.compile` tests with `fullgraph=True` (…
ydshieh Aug 20, 2025
7c9cb05
[`FA`] Fix dtype in varlen with position ids (#40295)
vasqu Aug 20, 2025
64e302d
[docs] delete more TF/Flax docs (#40289)
gante Aug 20, 2025
fbcda48
Clean up X-Codec. (#40271)
ebezzam Aug 20, 2025
214ee72
Remove OTel SDK dependencies (#40305)
anuraaga Aug 20, 2025
a25a926
Fix GOT-OCR2 and Cohere2Vision image processor patches caculation (#4…
Isotr0py Aug 20, 2025
3dd101b
[`fix`] Pass adamw optimizer parameters to StableAdamW (#40184)
emapco Aug 20, 2025
879f4f1
chore: fix typo in `find_executable_batch_size` to match new 0.9 rati…
MilkClouds Aug 20, 2025
219a63c
:rotating_light: [`Flash Attention`] Fix sliding window size (#40163)
vasqu Aug 20, 2025
ad0f8a8
Remove unnecessary contiguous calls for modern torch (#40315)
Rocketknight1 Aug 20, 2025
47f6028
Add support for Florence-2 (#38188)
ducviet00 Aug 20, 2025
4cbdeac
Qwen2.5-Omni test fixes (#40307)
ahadnagy Aug 20, 2025
9193d46
Add back `_tp_plan` attribute (#39944)
rishub-tamirisa Aug 20, 2025
cd434dd
byebye torch 2.1 (#40317)
Rocketknight1 Aug 20, 2025
de27972
No more `natten` (#40287)
ydshieh Aug 20, 2025
7cca954
[`GPT OSS`] Refactor the tests as it was not properly checking the ou…
ArthurZucker Aug 20, 2025
90a9871
Update CI with nightly torch workflow file (#40306)
ydshieh Aug 20, 2025
49d574f
Fix: Apply `get_placeholder_mask` in Ovis2 (#40280)
thisisiron Aug 20, 2025
4c54e65
Update notification service amd_daily_ci_workflows definition (#40314)
ivarflakstad Aug 20, 2025
5a8a382
One cache class to rule them all (#40276)
Cyrilvallez Aug 20, 2025
0caa9dc
Fix chunked attention mask with left-padding (#40324)
Cyrilvallez Aug 21, 2025
22cd5e2
[docs] remove flax references from `/en/model_doc` (#40311)
gante Aug 21, 2025
cf6b79c
Fix qwen-omni processor text only mode (#40336)
yuekaizhang Aug 21, 2025
7651143
Change Qwen2RMSNorm to RMSNorm from PyTorch (#40066)
cyyever Aug 21, 2025
d2f3be7
Add DeepseekV3ForSequenceClassification for Deepseek V3 models (#40200)
abdokaseb Aug 21, 2025
235dee2
Fix deprecation warning version (#40343)
Cyrilvallez Aug 21, 2025
efb1668
Add missing arguments to class constructors (#40068)
cyyever Aug 21, 2025
a0b051d
[docs] remove TF references from `/en/model_doc` (#40344)
gante Aug 21, 2025
46300ef
Fix: Only call Trainer.align_special_tokens if model has "config" att…
tomaarsen Aug 21, 2025
c4ccb0e
add type hints (#40319)
wirthual Aug 21, 2025
edb5bc6
Fix an infinite loop bug in recursive search of relative imports (#40…
eladsegal Aug 21, 2025
e546bf0
Fix links in Glm4vMoe configuration classes to point to the correct H…
vvvdwbvvv Aug 21, 2025
afb4851
T5 test and target device fixes (#40313)
ahadnagy Aug 21, 2025
6daa0be
Update `test_spm_converter_bytefallback_warning` (#40284)
ydshieh Aug 21, 2025
b67ff23
(small) fix conditional for input_ids and input_embeds in marian (#40…
cyntqliu Aug 21, 2025
9b7dbb7
Fix attention vizualizer (#40285)
molbap Aug 21, 2025
c4c14b2
[ModernBert] Prevent the attention mask from being None in ModernBert…
ashmikuz Aug 21, 2025
c77923e
Clean up XCodec and other codecs (#40348)
ebezzam Aug 21, 2025
80806c1
[serve] add cors warnings (#40112)
gante Aug 21, 2025
742e596
[detection] use consistent dtype for Conditional and DAB DETR positio…
agkphysics Aug 21, 2025
edf45e3
Remove more PyTorch 2.2 compatible code (#40337)
cyyever Aug 21, 2025
9b39d4e
[`FA`] Fix some model tests (#40350)
vasqu Aug 21, 2025
b00cde2
Qwen2.5-VL test fixes for ROCm (#40308)
ahadnagy Aug 21, 2025
ebda6b7
[generate] handle support for cache classes when num enc layers != nu…
gante Aug 21, 2025
48c2865
[4/N]more docs to device agnostic (#40355)
yao-matrix Aug 21, 2025
d9af01c
DOCS: Clarification on the use of `label_names` as an argument to Tra…
huzaifa-jawad367 Aug 22, 2025
d3cb4ba
HunYuan opensource (#39606)
yjc9696 Aug 22, 2025
6aca74f
Fix idefics3 vision embeddings indices dtype (#40360)
Isotr0py Aug 22, 2025
8855f1a
wav2vec2 fixes (#40341)
remi-or Aug 22, 2025
bad10bf
Change multimodal data links to HF hub (#40309)
zucchini-nlp Aug 22, 2025
6c12c94
[pipelines] add support to `skip_special_tokens` in the main text gen…
gante Aug 22, 2025
9808e0c
⚠️⚠️ Use `dtype` instead of `torch_dtype` everywhere! (#39782)
Cyrilvallez Aug 22, 2025
afb5c51
[processor] move commonalities to mixin (#40339)
zucchini-nlp Aug 22, 2025
2085855
[configuration] allow to overwrite kwargs from subconfigs (#40241)
zucchini-nlp Aug 22, 2025
8c56754
fix(example): align parameter names with the latest function definiti…
developer0hye Aug 22, 2025
a59748a
Addiing ByteDance Seed Seed-OSS (#40272)
Fazziekey Aug 22, 2025
7847025
Add GptOssForTokenClassification for GPT-OSS models (#40190)
abdokaseb Aug 22, 2025
33f092d
Bug Fix: Dynamically set return_lse flag in FlexAttention (#40352)
amd-lalithnc Aug 22, 2025
67c023a
Chat Template Doc Fixes (#40173)
Rocketknight1 Aug 22, 2025
b3dbe1f
Rework the Cache documentation (#40373)
Cyrilvallez Aug 22, 2025
f3aee54
Update README_zh-hans.md (#40380)
TardC Aug 22, 2025
5347cdd
HF papers in doc (#40381)
qgallouedec Aug 22, 2025
a28acc8
initial commit of PLDR-LLM model files.
burcgokden Aug 27, 2025
15 changes: 8 additions & 7 deletions .circleci/create_circleci_config.py
@@ -109,7 +109,9 @@ def __post_init__(self):
             self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
         print(f"Using {self.docker_image} docker image")
         if self.install_steps is None:
-            self.install_steps = ["uv venv && uv pip install ."]
+            self.install_steps = ["uv pip install ."]
+            # Use a custom patched pytest to force exit the process at the end, to avoid `Too long with no output (exceeded 10m0s): context deadline exceeded`
+            self.install_steps.append("uv pip install git+https://github.com/ydshieh/[email protected]")
         if self.pytest_options is None:
             self.pytest_options = {}
         if isinstance(self.tests_to_run, str):
@@ -213,7 +215,7 @@ def job_name(self):
     docker_image=[{"image": "huggingface/transformers-torch-light"}],
     # networkx==3.3 (after #36957) cause some issues
     # TODO: remove this once it works directly
-    install_steps=["uv venv && uv pip install ."],
+    install_steps=["uv pip install ."],
     marker="generate",
     parallelism=6,
 )
@@ -250,7 +252,7 @@ def job_name(self):
     additional_env={"OMP_NUM_THREADS": 8},
     docker_image=[{"image":"huggingface/transformers-examples-torch"}],
     # TODO @ArthurZucker remove this once docker is easier to build
-    install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
+    install_steps=["uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
     pytest_num_workers=4,
 )

@@ -259,7 +261,7 @@ def job_name(self):
     additional_env={"HUGGINGFACE_CO_STAGING": True},
     docker_image=[{"image":"huggingface/transformers-torch-light"}],
     install_steps=[
-        'uv venv && uv pip install .',
+        'uv pip install .',
         'git config --global user.email "[email protected]"',
         'git config --global user.name "ci"',
     ],
@@ -273,7 +275,6 @@ def job_name(self):
     "onnx",
     docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
     install_steps=[
-        "uv venv",
         "uv pip install .[testing,sentencepiece,onnxruntime,vision,rjieba]",
     ],
     pytest_options={"k onnx": None},
@@ -303,7 +304,7 @@ def job_name(self):
     docker_image=[{"image": "huggingface/transformers-torch-light"}],
     # networkx==3.3 (after #36957) cause some issues
     # TODO: remove this once it works directly
-    install_steps=["uv venv && uv pip install .[serving]"],
+    install_steps=["uv pip install .[serving]"],
     marker="not generate",
     parallelism=6,
 )
@@ -321,7 +322,7 @@ def job_name(self):
     additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},
     install_steps=[
         # Add an empty file to keep the test step running correctly even no file is selected to be tested.
-        "uv venv && pip install .",
+        "uv pip install .",
         "touch dummy.py",
         command,
         "cat pr_documentation_tests_temp.txt",
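Taken together, the `create_circleci_config.py` hunks above change every job's default install steps: the explicit `uv venv &&` prefix is dropped, and a patched pytest is appended so the test process exits promptly. A minimal sketch of the resulting defaulting logic (a hypothetical standalone function, simplified from the real `CircleCIJob.__post_init__`):

```python
def default_install_steps(install_steps=None):
    """Sketch of the install-step defaulting after this PR (assumption:
    simplified reproduction, not the actual CircleCIJob dataclass)."""
    if install_steps is None:
        install_steps = ["uv pip install ."]
        # The patched pytest force-exits the process at the end, avoiding
        # CircleCI's "Too long with no output (exceeded 10m0s)" timeout.
        install_steps.append(
            "uv pip install git+https://github.com/ydshieh/[email protected]"
        )
    return install_steps

print(default_install_steps())
```

Jobs that pass their own `install_steps` (as the `generate`, examples, staging, onnx, serving, and doc-test jobs do above) bypass the default and only needed the `uv venv &&` prefix removed.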
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -48,7 +48,7 @@ jobs:

       - name: Run database init script
         run: |
-          psql -f benchmark/init_db.sql
+          psql -f benchmark/utils/init_db.sql
         env:
           PGDATABASE: metrics
           PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
5 changes: 4 additions & 1 deletion .github/workflows/check_failed_tests.yml
@@ -21,6 +21,9 @@ on:
       report_repo_id:
         required: true
         type: string
+      commit_sha:
+        required: false
+        type: string


 env:
@@ -87,7 +90,7 @@ jobs:
       - name: Update clone
         working-directory: /transformers
         if: ${{ env.process == 'true' }}
-        run: git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

       - name: Get target commit
         working-directory: /transformers/utils
49 changes: 49 additions & 0 deletions .github/workflows/collated-reports.yml
@@ -0,0 +1,49 @@
name: CI collated reports

on:
workflow_call:
inputs:
job:
required: true
type: string
report_repo_id:
required: true
type: string
machine_type:
required: true
type: string
gpu_name:
description: Name of the GPU used for the job. Its enough that the value contains the name of the GPU, e.g. "noise-h100-more-noise". Case insensitive.
required: true
type: string

jobs:
collated_reports:
name: Collated reports
runs-on: ubuntu-22.04
if: always()
steps:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4

- name: Collated reports
shell: bash
env:
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_SHA: ${{ github.sha }}
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
run: |
pip install huggingface_hub
python3 utils/collated_reports.py \
--path /transformers/reports/ \
--machine-type ${{ inputs.machine_type }} \
--commit-hash ${{ env.CI_SHA }} \
--job ${{ inputs.job }} \
--report-repo-id ${{ inputs.report_repo_id }} \
--gpu-name ${{ inputs.gpu_name }}

- name: Upload collated reports
uses: actions/upload-artifact@v4
with:
name: collated_reports_${{ env.CI_SHA }}.json
path: collated_reports_${{ env.CI_SHA }}.json
5 changes: 4 additions & 1 deletion .github/workflows/model_jobs.yml
@@ -18,6 +18,9 @@ on:
       docker:
         required: true
         type: string
+      commit_sha:
+        required: false
+        type: string
       report_name_prefix:
         required: false
         default: run_models_gpu
@@ -70,7 +73,7 @@ jobs:

       - name: Update clone
         working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
37 changes: 13 additions & 24 deletions .github/workflows/self-nightly-caller.yml
@@ -1,43 +1,32 @@
name: Self-hosted runner (nightly-ci)

name: Nvidia CI with nightly torch

on:
repository_dispatch:
schedule:
- cron: "17 2 * * *"
# triggered when the daily scheduled Nvidia CI is completed.
# This way, we can compare the results more easily.
workflow_run:
workflows: ["Nvidia CI"]
branches: ["main"]
types: [completed]
push:
branches:
- run_nightly_ci*
- run_ci_with_nightly_torch*

jobs:
build_nightly_ci_images:
name: Build Nightly CI Docker Images
if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
build_nightly_torch_ci_images:
name: Build CI Docker Images with nightly torch
uses: ./.github/workflows/build-nightly-ci-docker-images.yml
secrets: inherit

model-ci:
name: Model CI
needs: [build_nightly_ci_images]
needs: build_nightly_torch_ci_images
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-past-future"
runner: ci
docker: huggingface/transformers-all-latest-torch-nightly-gpu
ci_event: Nightly CI
secrets: inherit

deepspeed-ci:
name: DeepSpeed CI
needs: [build_nightly_ci_images]
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-past-future"
runner: ci
# test deepspeed nightly build with the latest release torch
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Nightly CI
working-directory-prefix: /workspace
report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
secrets: inherit
25 changes: 0 additions & 25 deletions .github/workflows/self-push-amd-mi300-caller.yml

This file was deleted.

4 changes: 4 additions & 0 deletions .github/workflows/self-scheduled-amd-mi325-caller.yml
@@ -24,6 +24,7 @@ jobs:
       docker: huggingface/transformers-pytorch-amd-gpu
       ci_event: Scheduled CI (AMD) - mi325
       report_repo_id: optimum-amd/transformers_daily_ci
+      env_file: /etc/podinfo/gha-gpu-isolation-settings
     secrets: inherit

   torch-pipeline:
@@ -36,6 +37,7 @@ jobs:
       docker: huggingface/transformers-pytorch-amd-gpu
       ci_event: Scheduled CI (AMD) - mi325
       report_repo_id: optimum-amd/transformers_daily_ci
+      env_file: /etc/podinfo/gha-gpu-isolation-settings
     secrets: inherit

   example-ci:
@@ -48,6 +50,7 @@ jobs:
       docker: huggingface/transformers-pytorch-amd-gpu
       ci_event: Scheduled CI (AMD) - mi325
       report_repo_id: optimum-amd/transformers_daily_ci
+      env_file: /etc/podinfo/gha-gpu-isolation-settings
     secrets: inherit

   deepspeed-ci:
@@ -60,4 +63,5 @@ jobs:
       docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
       ci_event: Scheduled CI (AMD) - mi325
       report_repo_id: optimum-amd/transformers_daily_ci
+      env_file: /etc/podinfo/gha-gpu-isolation-settings
     secrets: inherit
@@ -1,8 +1,8 @@
-name: Self-hosted runner scale set (AMD mi300 scheduled CI caller)
+name: Self-hosted runner scale set (AMD mi355 scheduled CI caller)

 # Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
-# For example, 1gpu scale set: amd-mi300-ci-1gpu
-#              2gpu scale set: amd-mi300-ci-2gpu
+# For example, 1gpu : amd-mi355-ci-1gpu
+#              2gpu : amd-mi355-ci-2gpu

 on:
   workflow_run:
@@ -20,9 +20,9 @@ jobs:
     with:
       job: run_models_gpu
       slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi300-ci
+      runner_scale_set: amd-mi355-ci
       docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi300
+      ci_event: Scheduled CI (AMD) - mi355
       report_repo_id: optimum-amd/transformers_daily_ci
     secrets: inherit

@@ -32,9 +32,9 @@ jobs:
     with:
       job: run_pipelines_torch_gpu
       slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi300-ci
+      runner_scale_set: amd-mi355-ci
       docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi300
+      ci_event: Scheduled CI (AMD) - mi355
       report_repo_id: optimum-amd/transformers_daily_ci
     secrets: inherit

@@ -44,9 +44,9 @@ jobs:
     with:
       job: run_examples_gpu
       slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi300-ci
+      runner_scale_set: amd-mi355-ci
       docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi300
+      ci_event: Scheduled CI (AMD) - mi355
       report_repo_id: optimum-amd/transformers_daily_ci
     secrets: inherit

@@ -56,8 +56,8 @@ jobs:
     with:
       job: run_torch_cuda_extensions_gpu
       slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi300-ci
+      runner_scale_set: amd-mi355-ci
       docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi300
+      ci_event: Scheduled CI (AMD) - mi355
       report_repo_id: optimum-amd/transformers_daily_ci
     secrets: inherit
11 changes: 8 additions & 3 deletions .github/workflows/self-scheduled-caller.yml
@@ -1,13 +1,12 @@
-name: Self-hosted runner (scheduled)
-
+name: Nvidia CI

 on:
   repository_dispatch:
   schedule:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_scheduled_ci*
+      - run_nvidia_ci*
   workflow_dispatch:
     inputs:
       prev_workflow_run_id:
@@ -54,6 +53,7 @@ jobs:
       docker: huggingface/transformers-all-latest-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit

   torch-pipeline:
@@ -65,6 +65,7 @@ jobs:
       docker: huggingface/transformers-pytorch-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit

   example-ci:
@@ -76,6 +77,7 @@ jobs:
       docker: huggingface/transformers-all-latest-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit

   trainer-fsdp-ci:
@@ -87,6 +89,7 @@ jobs:
       docker: huggingface/transformers-all-latest-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit

   deepspeed-ci:
@@ -99,6 +102,7 @@ jobs:
       ci_event: Daily CI
       working-directory-prefix: /workspace
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit

   quantization-ci:
@@ -110,4 +114,5 @@ jobs:
       docker: huggingface/transformers-quantization-latest-gpu
       ci_event: Daily CI
       report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
     secrets: inherit