fix!: remove post processing of base_ref and base_rev parameters

ahal · ahal · commit 90a856b0bbd7 · 2025-10-14T14:25:23.000-04:00
BREAKING CHANGE: the base_ref and base_rev parameters will now match
what was passed into the Decision task exactly. This means `base_ref`
will not get reset to the repo's default branch, and `base_rev` will no
longer be reset to the merge base.

### Why we were previously post-processing these parameters

GitHub push and pull_request events almost never contain the base ref,
and often don't contain the base rev. The base ref only ever seems to be
present when pushing a new branch. A proper base rev is missing in the
following cases:

1. All pull_request events. The value we pass in is actually the
   revision that the base ref happens to be pointing at when the event
   is fired.
2. Force pushes. The value we pass in is actually the old unreachable
   head rev that used to be on the branch.

Really, we only ever get a proper base rev when doing normal
fast-forward pushes.

The reason we need a base rev in the first place is to compute the
files changed. So let's say there's a PR with the following graph:

    A -> B -> C -> D (main)
         \
          E -> F (PR)


Based on the GitHub event, `D` is being passed in as `base_rev` and `F`
is being passed in as `head_rev`. The post-processing was essentially
running `git merge-base` to find `B`, so that we could then use `git
log` to find all files touched between `B..F`.

### Why the post-processing isn't actually necessary

It turns out that `git log D..F` already does the right thing! It means
"show me all commits reachable from `F`, but not reachable from `D`",
which is exactly what we want. Only files changed in `E` and `F` will be
included. So there's no benefit to using `merge-base` for the purposes
of finding changed files.

As far as the `base_ref` post-processing goes, all we were doing was
setting it to `repo.default_branch` if it wasn't passed in (which is
almost always the case for pushes). While this likely works in most
cases, we don't actually have any idea what the real `base_ref` is, so
it's not correct to be doing this.

Since we don't even need `base_ref` to determine files changed, I think
it's fine to leave it as `None` in the event it wasn't passed in.

### Risks with this patch

While removing this should be fine for the purposes of determining
changed files, projects might be relying on the `merge-base` for other
use cases. Such projects may need to determine the merge-base on their
own; therefore, this patch is backwards incompatible.

### Future improvements

I think we should consider removing or renaming the `base_ref` and
`base_rev` parameters. As outlined, they are misleading (at least with
GitHub-based repos) and don't actually represent the "base revision".

But for now, I'll save that particular change for another day.
diff --git a/src/taskgraph/decision.py b/src/taskgraph/decision.py
@@ -21,7 +21,7 @@
 from taskgraph.util import json
 from taskgraph.util.python_path import find_object
 from taskgraph.util.schema import Schema, validate_schema
-from taskgraph.util.vcs import Repository, get_repository
+from taskgraph.util.vcs import get_repository
 from taskgraph.util.yaml import load_yaml
 
 logger = logging.getLogger(__name__)
@@ -192,25 +192,10 @@ def get_decision_parameters(graph_config, options):
     except UnicodeDecodeError:
         commit_message = ""
 
-    parameters["base_ref"] = _determine_more_accurate_base_ref(
-        repo,
-        candidate_base_ref=options.get("base_ref"),
-        head_ref=options.get("head_ref"),
-        base_rev=options.get("base_rev"),
-    )
-
-    parameters["base_rev"] = _determine_more_accurate_base_rev(
-        repo,
-        base_ref=parameters["base_ref"],
-        candidate_base_rev=options.get("base_rev"),
-        head_rev=options.get("head_rev"),
-        env_prefix=_get_env_prefix(graph_config),
-    )
-
     # Define default filter list, as most configurations shouldn't need
     # custom filters.
     parameters["files_changed"] = repo.get_changed_files(
-        rev=parameters["head_rev"], base_rev=parameters["base_rev"]
+        rev=parameters["head_rev"], base=parameters["base_rev"]
     )
     parameters["filters"] = [
         "target_tasks_method",
@@ -284,68 +269,6 @@ def get_decision_parameters(graph_config, options):
     return result
 
 
-def _determine_more_accurate_base_ref(repo, candidate_base_ref, head_ref, base_rev):
-    base_ref = candidate_base_ref
-
-    if not candidate_base_ref:
-        base_ref = repo.default_branch
-    elif candidate_base_ref == head_ref and base_rev == Repository.NULL_REVISION:
-        logger.info(
-            "base_ref and head_ref are identical but base_rev equals the null revision. "
-            "This is a new branch but Github didn't identify its actual base."
-        )
-        base_ref = repo.default_branch
-
-    if base_ref != candidate_base_ref:
-        logger.info(
-            f'base_ref has been reset from "{candidate_base_ref}" to "{base_ref}".'
-        )
-
-    return base_ref
-
-
-def _determine_more_accurate_base_rev(
-    repo, base_ref, candidate_base_rev, head_rev, env_prefix
-):
-    if not candidate_base_rev:
-        logger.info("base_rev is not set.")
-        base_ref_or_rev = base_ref
-    elif candidate_base_rev == Repository.NULL_REVISION:
-        logger.info("base_rev equals the null revision. This branch is a new one.")
-        base_ref_or_rev = base_ref
-    elif not repo.does_revision_exist_locally(candidate_base_rev):
-        logger.warning(
-            "base_rev does not exist locally. It is likely because the branch was force-pushed. "
-            "taskgraph is not able to assess how many commits were changed and assumes it is only "
-            f"the last one. Please set the {env_prefix.upper()}_BASE_REV environment variable "
-            "in the decision task and provide `--base-rev` to taskgraph."
-        )
-        base_ref_or_rev = base_ref
-    else:
-        base_ref_or_rev = candidate_base_rev
-
-    if base_ref_or_rev == base_ref:
-        logger.info(
-            f'Using base_ref "{base_ref}" to determine latest common revision...'
-        )
-
-    base_rev = repo.find_latest_common_revision(base_ref_or_rev, head_rev)
-    if base_rev != candidate_base_rev:
-        if base_ref_or_rev == candidate_base_rev:
-            logger.info("base_rev is not an ancestor of head_rev.")
-
-        logger.info(
-            f'base_rev has been reset from "{candidate_base_rev}" to "{base_rev}".'
-        )
-
-    return base_rev
-
-
-def _get_env_prefix(graph_config):
-    repo_keys = list(graph_config["taskgraph"].get("repositories", {}).keys())
-    return repo_keys[0] if repo_keys else ""
-
-
 def set_try_config(parameters, task_config_file):
     if os.path.isfile(task_config_file):
         logger.info(f"using try tasks from {task_config_file}")
diff --git a/src/taskgraph/parameters.py b/src/taskgraph/parameters.py
@@ -33,7 +33,7 @@ class ParameterMismatch(Exception):
 base_schema = Schema(
     {
         Required("base_repository"): str,
-        Required("base_ref"): str,
+        Optional("base_ref"): str,
         Required("base_rev"): str,
         Required("build_date"): int,
         Required("build_number"): int,
diff --git a/test/test_decision.py b/test/test_decision.py
@@ -10,8 +10,6 @@
 import unittest
 from pathlib import Path
 
-import pytest
-
 from taskgraph import decision
 from taskgraph.util.vcs import GitRepository, HgRepository
 from taskgraph.util.yaml import load_yaml
@@ -54,6 +52,7 @@ class TestGetDecisionParameters(unittest.TestCase):
     def setUp(self):
         self.options = {
             "base_repository": "https://hg.mozilla.org/mozilla-unified",
+            "base_rev": "aaaa",
             "head_repository": "https://hg.mozilla.org/mozilla-central",
             "head_rev": "bbbb",
             "head_ref": "default",
@@ -67,27 +66,11 @@ def setUp(self):
             "tasks_for": "hg-push",
             "level": "3",
         }
-        self.old_determine_more_accurate_base_rev = (
-            decision._determine_more_accurate_base_rev
-        )
-        decision._determine_more_accurate_base_rev = lambda *_, **__: "aaaa"
-        self.old_determine_more_accurate_base_ref = (
-            decision._determine_more_accurate_base_ref
-        )
-        decision._determine_more_accurate_base_ref = lambda *_, **__: "aaaa"
-
-    def tearDown(self):
-        decision._determine_more_accurate_base_rev = (
-            self.old_determine_more_accurate_base_rev
-        )
-        decision._determine_more_accurate_base_ref = (
-            self.old_determine_more_accurate_base_ref
-        )
 
     def test_simple_options(self, mock_files_changed):
         mock_files_changed.return_value = ["foo.txt"]
         params = decision.get_decision_parameters(FAKE_GRAPH_CONFIG, self.options)
-        mock_files_changed.assert_called_once_with(rev="bbbb", base_rev="aaaa")
+        mock_files_changed.assert_called_once_with(rev="bbbb", base="aaaa")
         self.assertEqual(params["build_date"], 1503691511)
         self.assertEqual(params["head_tag"], "v0.0.1")
         self.assertEqual(params["pushlog_id"], "143")
@@ -154,73 +137,3 @@ def test_dontbuild_commit_message_yields_default_target_tasks_method(
         self.options["tasks_for"] = "hg-push"
         params = decision.get_decision_parameters(FAKE_GRAPH_CONFIG, self.options)
         self.assertEqual(params["target_tasks_method"], "nothing")
-
-
-@pytest.mark.parametrize(
-    "candidate_base_ref, base_rev, expected_base_ref",
-    (
-        ("", "base-rev", "default-branch"),
-        ("head-ref", "base-rev", "head-ref"),
-        ("head-ref", "0000000000000000000000000000000000000000", "default-branch"),
-    ),
-)
-def test_determine_more_accurate_base_ref(
-    candidate_base_ref, base_rev, expected_base_ref
-):
-    repo_mock = unittest.mock.MagicMock()
-    repo_mock.default_branch = "default-branch"
-
-    assert (
-        decision._determine_more_accurate_base_ref(
-            repo_mock, candidate_base_ref, "head-ref", base_rev
-        )
-        == expected_base_ref
-    )
-
-
-@pytest.mark.parametrize(
-    "common_rev, candidate_base_rev, expected_base_ref_or_rev, expected_base_rev",
-    (
-        ("found-rev", "", "base-ref", "found-rev"),
-        (
-            "found-rev",
-            "0000000000000000000000000000000000000000",
-            "base-ref",
-            "found-rev",
-        ),
-        ("found-rev", "non-existing-rev", "base-ref", "found-rev"),
-        ("existing-rev", "existing-rev", "existing-rev", "existing-rev"),
-    ),
-)
-def test_determine_more_accurate_base_rev(
-    common_rev, candidate_base_rev, expected_base_ref_or_rev, expected_base_rev
-):
-    repo_mock = unittest.mock.MagicMock()
-    repo_mock.find_latest_common_revision.return_value = common_rev
-    repo_mock.does_revision_exist_locally = lambda rev: rev == "existing-rev"
-
-    assert (
-        decision._determine_more_accurate_base_rev(
-            repo_mock, "base-ref", candidate_base_rev, "head-rev", env_prefix="PREFIX"
-        )
-        == expected_base_rev
-    )
-    repo_mock.find_latest_common_revision.assert_called_once_with(
-        expected_base_ref_or_rev, "head-rev"
-    )
-
-
-@pytest.mark.parametrize(
-    "graph_config, expected_value",
-    (
-        ({"taskgraph": {}}, ""),
-        ({"taskgraph": {"repositories": {}}}, ""),
-        ({"taskgraph": {"repositories": {"mobile": {}}}}, "mobile"),
-        (
-            {"taskgraph": {"repositories": {"mobile": {}, "some-other-repo": {}}}},
-            "mobile",
-        ),
-    ),
-)
-def test_get_env_prefix(graph_config, expected_value):
-    assert decision._get_env_prefix(graph_config) == expected_value

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@ class ParameterMismatch(Exception):`
`33`	`33`	`base_schema = Schema(`
`34`	`34`	`{`
`35`	`35`	`Required("base_repository"): str,`
`36`		`- Required("base_ref"): str,`
	`36`	`+ Optional("base_ref"): str,`
`37`	`37`	`Required("base_rev"): str,`
`38`	`38`	`Required("build_date"): int,`
`39`	`39`	`Required("build_number"): int,`