[build] implement jobs to retry rbe failures with extra debugging

titusfortner · titusfortner · commit b3c871977475 · 2025-03-18T14:42:25.000-07:00
diff --git a/.github/workflows/bazel.yml b/.github/workflows/bazel.yml
@@ -61,6 +61,16 @@ on:
         required: false
         type: string
         default: ''
+      download-name:
+        description: Name of artifact to download
+        required: false
+        type: string
+        default: ''
+      download-path:
+        description: Path of artifact to download
+        required: false
+        type: string
+        default: ''
       upload-name:
         description: Name of artifact to upload
         required: false
@@ -161,6 +171,12 @@ jobs:
       - name: Setup curl for Ubuntu
         if: inputs.os == 'ubuntu'
         run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev
+      - name: "Download artifact"
+        if: ${{ inputs.download-name != '' && inputs.download-path != '' }}
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.download-name }}
+          path: ${{ inputs.download-path }}
       - name: Run Bazel
         run: ${{ inputs.run }}
       - name: Start SSH session
diff --git a/.github/workflows/ci-rbe.yml b/.github/workflows/ci-rbe.yml
@@ -29,3 +29,35 @@ jobs:
       upload-name: bazel-logs
       upload-path: bazel-logs
       run: ./scripts/github-actions/ci-build.sh
+  retry-remote:
+    name: Retry Failures on RBE
+    needs: test
+    if: always() && needs.test.result == 'failure' && github.repository_owner == 'seleniumhq' && startsWith(github.head_ref, 'renovate/') != true
+    uses: ./.github/workflows/bazel.yml
+    with:
+      name: RBE Retries
+      caching: false
+      ruby-version: jruby-9.4.12.0
+      download-name: bazel-logs
+      download-path: bazel-logs
+      upload-name: retry-remote-logs
+      upload-path: bazel-logs
+      run: |
+        ./go retry_failed_tests "${BAZEL_LOG_FILE}" true
+  retry-local:
+    name: Retry Failures on GHA
+    needs: test
+    if: always() && needs.test.result == 'failure' && github.repository_owner == 'seleniumhq' && startsWith(github.head_ref, 'renovate/') != true
+    uses: ./.github/workflows/bazel.yml
+    with:
+      name: GHA Retries
+      caching: false
+      ruby-version: jruby-9.4.12.0
+      browser: needs-xvfb
+      java-version: 17
+      download-name: bazel-logs
+      download-path: bazel-logs
+      upload-name: retry-local-logs
+      upload-path: bazel-logs
+      run: |
+        ./go retry_failed_tests "${BAZEL_LOG_FILE}" false
diff --git a/Rakefile b/Rakefile
@@ -274,6 +274,45 @@ end
 task test_py: [:py_prep_for_install_release, 'py:marionette_test']
 task build: %i[all firefox remote selenium tests]
 
+desc 'Retry failed tests from a log file'
+# Args: log_file is a bazel log with test results; rbe 'true' retries with --config=remote; extras go to bazel.
+task :retry_failed_tests, [:log_file, :rbe] do |_task, arguments|
+  log_file = arguments[:log_file]
+  raise 'Error: Please provide a bazel log file containing test results.' if log_file.nil?
+  raise "Error: Log file '#{log_file}' does not exist." unless File.exist?(log_file)
+
+  # The flag arrives as a string, so compare it once here; a bare truthiness check treats 'false' as true.
+  rbe = arguments[:rbe] == 'true'
+  failing_tests = []
+
+  # Walk the log bottom-up, collecting FAILED targets until a PASSED line is seen above them.
+  File.readlines(log_file).reverse_each do |line|
+    if line.match?(%r{//.+:.*FAILED})
+      failing_tests << line.strip.split[0]
+    elsif !failing_tests.empty? && line.match?(%r{//.+:.*PASSED})
+      break
+    end
+  end
+
+  puts "Found #{failing_tests.size} failing tests; Retrying"
+  retry_args = arguments.extras + %w[--test_output=streamed --test_env DEBUG=true]
+  retry_args << (rbe ? '--config=remote' : '--pin_browsers')
+
+  retry_failures = false
+  failing_tests.each do |failed_target|
+    target_name = "#{failed_target.split('/').last.tr(':', '_')}#{rbe ? '_rbe' : '_gha'}"
+    retry_logs = log_file.sub('.log', "_#{target_name}_retry.log")
+    begin
+      Bazel.execute('test', retry_args, failed_target, verbose: true, log_file: retry_logs)
+    rescue StandardError => e
+      retry_failures = true
+      puts "Failed retry of #{failed_target}: #{e.message}"
+    end
+  end
+
+  raise 'Some tests failed during retry' if retry_failures
+end
+
 desc 'Clean build artifacts.'
 task :clean do
   rm_rf 'build/'