Convert change detection to a Python script

AA-Turner · AA-Turner · commit 819da3b62e0f · 2025-02-03T23:51:40.000Z
diff --git a/.github/workflows/reusable-change-detection.yml b/.github/workflows/reusable-change-detection.yml
@@ -52,122 +52,59 @@ jobs:
     timeout-minutes: 10
     outputs:
       config-hash: ${{ steps.config-hash.outputs.hash }}
-      run-cifuzz: ${{ steps.check.outputs.run-cifuzz }}
-      run-docs: ${{ steps.docs-changes.outputs.run-docs }}
-      run-hypothesis: ${{ steps.check.outputs.run-hypothesis }}
-      run-tests: ${{ steps.check.outputs.run-tests }}
-      run-win-msi: ${{ steps.win-msi-changes.outputs.run-win-msi }}
+      run-cifuzz: ${{ steps.changes.outputs.run-cifuzz }}
+      run-docs: ${{ steps.changes.outputs.run-docs }}
+      run-hypothesis: ${{ steps.changes.outputs.run-hypothesis }}
+      run-tests: ${{ steps.changes.outputs.run-tests }}
+      run-win-msi: ${{ steps.changes.outputs.run-win-msi }}
     steps:
+    - uses: actions/setup-python@v5
+      with:
+        python-version: "3"
+
     - run: >-
         echo '${{ github.event_name }}'
+
     - uses: actions/checkout@v4
       with:
         persist-credentials: false
-    - name: Check for source changes
-      id: check
+        ref: >-
+          ${{
+            github.event_name == 'pull_request'
+            && github.event.pull_request.head.sha
+            || ''
+          }}
+
+    # Adapted from https://github.com/actions/checkout/issues/520#issuecomment-1167205721
+    - name: Fetch commits to get branch diff
+      if: github.event_name == 'pull_request'
       run: |
-        if [ -z "$GITHUB_BASE_REF" ]; then
-          echo "run-tests=true" >> "$GITHUB_OUTPUT"
-        else
-          git fetch origin "$GITHUB_BASE_REF" --depth=1
-          # git diff "origin/$GITHUB_BASE_REF..." (3 dots) may be more
-          # reliable than git diff "origin/$GITHUB_BASE_REF.." (2 dots),
-          # but it requires to download more commits (this job uses
-          # "git fetch --depth=1").
-          #
-          # git diff "origin/$GITHUB_BASE_REF..." (3 dots) works with Git
-          # 2.26, but Git 2.28 is stricter and fails with "no merge base".
-          #
-          # git diff "origin/$GITHUB_BASE_REF.." (2 dots) should be enough on
-          # GitHub, since GitHub starts by merging origin/$GITHUB_BASE_REF
-          # into the PR branch anyway.
-          #
-          # https://github.com/python/core-workflow/issues/373
-          grep_ignore_args=(
-            # file extensions
-              -e '\.md$'
-              -e '\.rst$'
-            # top-level folders
-              -e '^Doc/'
-              -e '^Misc/'
-            # configuration files
-              -e '^\.github/CODEOWNERS$'
-              -e '^\.pre-commit-config\.yaml$'
-              -e '\.ruff\.toml$'
-              -e 'mypy\.ini$'
-          )
-          git diff --name-only "origin/$GITHUB_BASE_REF.."        \
-            | grep -qvE "${grep_ignore_args[@]}"                  \
-            && echo "run-tests=true" >> "$GITHUB_OUTPUT" || true
-        fi
+        set -eux
+
+        # Fetch enough history to find a common ancestor commit (aka merge-base):
+        git fetch origin "${refspec_pr}" --depth=$(( commits + 1 )) \
+          --no-tags --prune --no-recurse-submodules
+
+        # This should get the oldest commit in the local fetched history (which may not be the commit the PR branched from):
+        COMMON_ANCESTOR=$( git rev-list --first-parent --max-parents=0 --max-count=1 "${branch_pr}" )
+        DATE=$( git log --date=iso8601 --format=%cd "${COMMON_ANCESTOR}" )
 
-        # Check if we should run hypothesis tests
-        GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}}
-        echo "$GIT_BRANCH"
-        if $(echo "$GIT_BRANCH" | grep -q -w '3\.\(8\|9\|10\|11\)'); then
-          echo "Branch too old for hypothesis tests"
-          echo "run-hypothesis=false" >> "$GITHUB_OUTPUT"
-        else
-          echo "Run hypothesis tests"
-          echo "run-hypothesis=true" >> "$GITHUB_OUTPUT"
-        fi
+        # Get all commits since that commit date from the base branch (eg: master or main):
+        git fetch origin "${refspec_base}" --shallow-since="${DATE}" \
+          --no-tags --prune --no-recurse-submodules
+      env:
+        branch_pr: 'origin/${{ github.event.pull_request.head.ref }}'
+        commits: ${{ github.event.pull_request.commits }}
+        refspec_base: '+${{ github.event.pull_request.base.sha }}:remotes/origin/${{ github.event.pull_request.base.ref }}'
+        refspec_pr: '+${{ github.event.pull_request.head.sha }}:remotes/origin/${{ github.event.pull_request.head.ref }}'
+
+    # We only want to run tests on PRs when related files are changed,
+    # or when someone triggers a manual workflow run.
+    - name: Compute changed files
+      id: changes
+      run: python Tools/build/compute-changes.py
 
-        # oss-fuzz maintains a configuration for fuzzing the main branch of
-        # CPython, so CIFuzz should be run only for code that is likely to be
-        # merged into the main branch; compatibility with older branches may
-        # be broken.
-        FUZZ_RELEVANT_FILES='(\.c$|\.h$|\.cpp$|^configure$|^\.github/workflows/build\.yml$|^Modules/_xxtestfuzz)'
-        if [ "$GITHUB_BASE_REF" = "main" ] && [ "$(git diff --name-only "origin/$GITHUB_BASE_REF.." | grep -qE $FUZZ_RELEVANT_FILES; echo $?)" -eq 0 ]; then
-          # The tests are pretty slow so they are executed only for PRs
-          # changing relevant files.
-          echo "Run CIFuzz tests"
-          echo "run-cifuzz=true" >> "$GITHUB_OUTPUT"
-        else
-          echo "Branch too old for CIFuzz tests; or no C files were changed"
-          echo "run-cifuzz=false" >> "$GITHUB_OUTPUT"
-        fi
     - name: Compute hash for config cache key
       id: config-hash
       run: |
         echo "hash=${{ hashFiles('configure', 'configure.ac', '.github/workflows/build.yml') }}" >> "$GITHUB_OUTPUT"
-    - name: Get a list of the changed documentation-related files
-      if: github.event_name == 'pull_request'
-      id: changed-docs-files
-      uses: Ana06/get-changed-files@v2.3.0
-      with:
-        filter: |
-          Doc/**
-          Misc/**
-          .github/workflows/reusable-docs.yml
-        format: csv  # works for paths with spaces
-    - name: Check for docs changes
-      # We only want to run this on PRs when related files are changed,
-      # or when user triggers manual workflow run.
-      if: >-
-        (
-          github.event_name == 'pull_request'
-          && steps.changed-docs-files.outputs.added_modified_renamed != ''
-        ) || github.event_name == 'workflow_dispatch'
-      id: docs-changes
-      run: |
-        echo "run-docs=true" >> "${GITHUB_OUTPUT}"
-    - name: Get a list of the MSI installer-related files
-      if: github.event_name == 'pull_request'
-      id: changed-win-msi-files
-      uses: Ana06/get-changed-files@v2.3.0
-      with:
-        filter: |
-          Tools/msi/**
-          .github/workflows/reusable-windows-msi.yml
-        format: csv  # works for paths with spaces
-    - name: Check for changes in MSI installer-related files
-      # We only want to run this on PRs when related files are changed,
-      # or when user triggers manual workflow run.
-      if: >-
-        (
-          github.event_name == 'pull_request'
-          && steps.changed-win-msi-files.outputs.added_modified_renamed != ''
-        ) || github.event_name == 'workflow_dispatch'
-      id: win-msi-changes
-      run: |
-        echo "run-win-msi=true" >> "${GITHUB_OUTPUT}"
diff --git a/Tools/build/compute-changes.py b/Tools/build/compute-changes.py
@@ -0,0 +1,199 @@
+"""Determine which GitHub Actions workflows to run.
+
+Called by ``.github/workflows/reusable-change-detection.yml``.
+We only want to run tests on PRs when related files are changed,
+or when someone triggers a manual workflow run.
+This improves developer experience by not doing (slow)
+unnecessary work in GHA, and saves CI resources.
+"""
+
+from __future__ import annotations
+
+import os
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from collections.abc import Set
+
+GITHUB_CODEOWNERS_PATH = Path(".github/CODEOWNERS")
+GITHUB_WORKFLOWS_PATH = Path(".github/workflows")
+CONFIGURATION_FILE_NAMES = frozenset({
+    ".pre-commit-config.yaml",
+    ".ruff.toml",
+    "mypy.ini",
+})
+SUFFIXES_DOCUMENTATION = frozenset({".rst", ".md"})
+SUFFIXES_C_OR_CPP = frozenset({".c", ".h", ".cpp"})
+
+
+@dataclass(kw_only=True, slots=True)
+class Outputs:
+    run_ci_fuzz: bool = False
+    run_docs: bool = False
+    run_hypothesis: bool = False
+    run_tests: bool = False
+    run_windows_msi: bool = False
+
+
+def compute_changes():
+    target_branch, head_branch = git_branches()
+    if target_branch and head_branch:
+        # Getting changed files only makes sense on a pull request
+        files = get_changed_files(
+            f"origin/{target_branch}", f"origin/{head_branch}"
+        )
+        outputs = process_changed_files(files)
+    else:
+        # Otherwise, just run the tests
+        outputs = Outputs(run_tests=True)
+    outputs = process_target_branch(outputs, target_branch)
+
+    if outputs.run_tests:
+        print("Run tests")
+
+    if outputs.run_hypothesis:
+        print("Run hypothesis tests")
+
+    if outputs.run_ci_fuzz:
+        print("Run CIFuzz tests")
+    else:
+        print("Branch too old for CIFuzz tests; or no C files were changed")
+
+    if outputs.run_docs:
+        print("Build documentation")
+
+    if outputs.run_windows_msi:
+        print("Build Windows MSI")
+
+    print(outputs)
+
+    write_github_output(outputs)
+
+
+def git_branches() -> tuple[str, str]:
+    target_branch = os.environ.get("GITHUB_BASE_REF", "")
+    target_branch = target_branch.removeprefix("refs/heads/")
+    print(f"target branch: {target_branch!r}")
+
+    head_branch = os.environ.get("GITHUB_HEAD_REF", "")
+    head_branch = head_branch.removeprefix("refs/heads/")
+    print(f"head branch: {head_branch!r}")
+    return target_branch, head_branch
+
+
+def get_changed_files(ref_a: str = "main", ref_b: str = "HEAD") -> Set[Path]:
+    """List the files changed between two Git refs, filtered by change type."""
+    args = ("git", "diff", "--name-only", f"{ref_a}...{ref_b}", "--")
+    print(*args)
+    changed_files_result = subprocess.run(
+        args, stdout=subprocess.PIPE, check=True, encoding="utf-8"
+    )
+    changed_files = changed_files_result.stdout.strip().splitlines()
+    return frozenset(map(Path, filter(None, map(str.strip, changed_files))))
+
+
+def process_changed_files(changed_files: Set[Path]) -> Outputs:
+    run_tests = False
+    run_ci_fuzz = False
+    run_docs = False
+    run_windows_msi = False
+
+    for file in changed_files:
+        file_name = file.name
+        file_suffix = file.suffix
+        file_parts = file.parts
+
+        # Documentation files
+        doc_or_misc = file_parts[0] in {"Doc", "Misc"}
+        doc_file = file_suffix in SUFFIXES_DOCUMENTATION or doc_or_misc
+
+        if file.parent == GITHUB_WORKFLOWS_PATH:
+            if file_name == "build.yml":
+                run_tests = run_ci_fuzz = True
+            if file_name == "reusable-docs.yml":
+                run_docs = True
+            if file_name == "reusable-windows-msi.yml":
+                run_windows_msi = True
+
+        if not (
+            doc_file
+            or file == GITHUB_CODEOWNERS_PATH
+            or file_name in CONFIGURATION_FILE_NAMES
+        ):
+            run_tests = True
+
+        # The fuzz tests are pretty slow so they are executed only for PRs
+        # changing relevant files.
+        if file_suffix in SUFFIXES_C_OR_CPP:
+            run_ci_fuzz = True
+        if file_parts[:2] in {
+            ("configure",),
+            ("Modules", "_xxtestfuzz"),
+        }:
+            run_ci_fuzz = True
+
+        # Check for changed documentation-related files
+        if doc_file:
+            run_docs = True
+
+        # Check for changed MSI installer-related files
+        if file_parts[:2] == ("Tools", "msi"):
+            run_windows_msi = True
+
+    return Outputs(
+        run_ci_fuzz=run_ci_fuzz,
+        run_docs=run_docs,
+        run_tests=run_tests,
+        run_windows_msi=run_windows_msi,
+    )
+
+
+def process_target_branch(outputs: Outputs, git_branch: str) -> Outputs:
+    if not git_branch:
+        outputs.run_tests = True
+
+    # Check if we should run the hypothesis tests
+    if git_branch in {"3.8", "3.9", "3.10", "3.11"}:
+        print("Branch too old for hypothesis tests")
+        outputs.run_hypothesis = False
+    else:
+        outputs.run_hypothesis = outputs.run_tests
+
+    # oss-fuzz maintains a configuration for fuzzing the main branch of
+    # CPython, so CIFuzz should be run only for code that is likely to be
+    # merged into the main branch; compatibility with older branches may
+    # be broken.
+    if git_branch != "main":
+        outputs.run_ci_fuzz = False
+
+    if os.environ.get("GITHUB_EVENT_NAME", "").lower() == "workflow_dispatch":
+        outputs.run_docs = True
+        outputs.run_windows_msi = True
+
+    return outputs
+
+
+def write_github_output(outputs: Outputs) -> None:
+    # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables
+    # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-an-output-parameter
+    if "GITHUB_OUTPUT" not in os.environ:
+        print("GITHUB_OUTPUT not defined!")
+        return
+
+    with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as f:
+        f.write(f"run-cifuzz={bool_lower(outputs.run_ci_fuzz)}\n")
+        f.write(f"run-docs={bool_lower(outputs.run_docs)}\n")
+        f.write(f"run-hypothesis={bool_lower(outputs.run_hypothesis)}\n")
+        f.write(f"run-tests={bool_lower(outputs.run_tests)}\n")
+        f.write(f"run-win-msi={bool_lower(outputs.run_windows_msi)}\n")
+
+
+def bool_lower(value: bool, /) -> str:
+    return "true" if value else "false"
+
+
+if __name__ == "__main__":
+    compute_changes()