Commit 5903a75

[autorevert] Inject synthetic PENDING events for pending workflows in signal extraction (#7272)
Summary:
- Add a post-processing step in signal extraction that injects synthetic PENDING events for pending workflows when the workflow hasn't (yet) produced more precise events for that (signal, commit) pair.
- Implemented via a simple pass over JobRows keyed by (sha, workflow) → pending wf_run_ids.
- Insert synthetic events only when a pending wf_run_id exists and **the signal has no existing event for that run.**
- Use started_at = now + 1 minute.

Rationale:
- Multi-stage workflows (build → test) delay job scheduling, leaving signals "empty" for active runs.
- Choosing "now + 1 minute" keeps the synthetic events in the future (a good proxy for the actual job start time).
- This is especially useful for bisection, to avoid re-dispatching workflows that are already scheduled (CH deduplication is not enough when a job takes 30+ minutes to appear).

### Testing
See, e.g., the pending events in the right column here: [2025-10-01T22-31-22.115571-00-00.html](https://github.com/user-attachments/files/22650153/2025-10-01T22-31-22.115571-00-00.html)
1 parent e8dfdb6 commit 5903a75
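
Before the full diff, here is a condensed, self-contained sketch of the injection rule described in the summary. The `Event`/`Commit` stand-in types and the `inject_pending` helper are illustrative only; the real implementation is the `_inject_pending_workflow_events` method added to `signal_extraction.py` below, which operates on `Signal`/`SignalCommit`/`SignalEvent` and keys pending runs by (sha, workflow).

```python
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Set


@dataclass
class Event:  # stand-in for SignalEvent
    wf_run_id: int
    status: str
    started_at: datetime


@dataclass
class Commit:  # stand-in for SignalCommit
    head_sha: str
    events: List[Event] = field(default_factory=list)


def inject_pending(commits: List[Commit], pending_runs: Dict[str, Set[int]]) -> List[Commit]:
    """For every pending wf_run_id that a commit has no event for yet,
    append a synthetic PENDING event dated one minute in the future."""
    now_plus = (datetime.now(timezone.utc) + timedelta(minutes=1)).replace(tzinfo=None)
    out: List[Commit] = []
    for c in commits:
        missing = pending_runs.get(c.head_sha, set()) - {e.wf_run_id for e in c.events}
        synthetic = [Event(run_id, "PENDING", now_plus) for run_id in sorted(missing)]
        out.append(Commit(c.head_sha, c.events + synthetic))
    return out


if __name__ == "__main__":
    # wf_run_id 200 is pending on H2 but has produced no events yet -> synthetic PENDING
    commits = [Commit("H2"), Commit("H1", [Event(190, "FAILURE", datetime(2025, 10, 1))])]
    for c in inject_pending(commits, {"H2": {200}}):
        print(c.head_sha, [(e.wf_run_id, e.status) for e in c.events])
```

In the real code the lookup key also includes the workflow name, and each synthetic event is created with `run_attempt=0` and `job_id=None`.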

File tree (2 files changed: +146, -2 lines):
- aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_extraction.py
- aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal_extraction.py


aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_extraction.py

Lines changed: 90 additions & 2 deletions
```diff
@@ -8,7 +8,7 @@
 """

 from dataclasses import dataclass
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 from typing import Dict, Iterable, List, Optional, Set, Tuple

 from .job_agg_index import JobAggIndex, JobMeta, SignalStatus as AggStatus
@@ -92,7 +92,11 @@ def extract(self) -> List[Signal]:
         # When that happens, the jobs have identical timestamps by DIFFERENT job ids.
         # But since they are still the same job logically, we want to deduplicate them
         # for the purposes of signal events.
-        return self._dedup_signal_events(test_signals + job_signals)
+        signals = self._dedup_signal_events(test_signals + job_signals)
+
+        # Inject synthetic PENDING events for workflow runs that are known to be
+        # pending but have no events in a given signal (e.g. multi-stage workflows).
+        return self._inject_pending_workflow_events(signals, jobs)

     # -----------------------------
     # Deduplication (GitHub-specific)
@@ -125,6 +129,90 @@ def _dedup_signal_events(self, signals: List[Signal]) -> List[Signal]:
         )
         return deduped

+    # -----------------------------
+    # Pending workflow synthesis
+    # -----------------------------
+    def _inject_pending_workflow_events(
+        self,
+        signals: List[Signal],
+        jobs: List[JobRow],
+    ) -> List[Signal]:
+        """
+        For each signal/commit, if there exists a pending workflow run and the
+        signal has no event for that wf_run_id, insert a synthetic PENDING event
+        with started_at set slightly in the future (now + 1 minute).
+        """
+        if not signals or not jobs:
+            return signals
+
+        # Simple pass over JobRows to collect pending workflow run ids per (sha, workflow)
+        pending_runs: Dict[Tuple[Sha, WorkflowName], Set[int]] = {}
+        for j in jobs:
+            if j.is_pending:
+                pending_runs.setdefault((j.head_sha, j.workflow_name), set()).add(
+                    int(j.wf_run_id)
+                )
+
+        # Avoid deprecated utcnow(); derive UTC then store naive to match existing code
+        now_plus = (datetime.now(timezone.utc) + timedelta(minutes=1)).replace(
+            tzinfo=None
+        )
+
+        out: List[Signal] = []
+        for s in signals:
+            new_commits: List[SignalCommit] = []
+            for c in s.commits:
+                pending_ids = pending_runs.get(
+                    (Sha(c.head_sha), WorkflowName(s.workflow_name))
+                )
+                if not pending_ids:
+                    new_commits.append(c)
+                    continue
+
+                have_ids = {e.wf_run_id for e in c.events}
+                missing_ids = pending_ids - have_ids
+                if not missing_ids:
+                    new_commits.append(c)
+                    continue
+
+                # Build synthetic pending events for the missing wf_run_ids;
+                # set started_at to the future
+                synth_events: List[SignalEvent] = list(c.events)
+                for wf_run_id in missing_ids:
+                    name = self._fmt_event_name(
+                        workflow=s.workflow_name,
+                        kind="synthetic",
+                        identifier=str(s.key),
+                        wf_run_id=WfRunId(wf_run_id),
+                        run_attempt=RunAttempt(0),
+                    )
+                    synth_events.append(
+                        SignalEvent(
+                            name=name,
+                            status=SignalStatus.PENDING,
+                            started_at=now_plus,
+                            ended_at=None,
+                            wf_run_id=int(wf_run_id),
+                            run_attempt=0,
+                            job_id=None,
+                        )
+                    )
+                new_commits.append(
+                    SignalCommit(
+                        head_sha=c.head_sha, timestamp=c.timestamp, events=synth_events
+                    )
+                )
+
+            out.append(
+                Signal(
+                    key=s.key,
+                    workflow_name=s.workflow_name,
+                    commits=new_commits,
+                    job_base_name=s.job_base_name,
+                )
+            )
+        return out
+
     # -----------------------------
     # Phase B — Tests (test_run_s3 only)
     # -----------------------------
```
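
One detail worth calling out in the diff above: the helper avoids `datetime.utcnow()` (deprecated since Python 3.12) by taking an aware UTC timestamp and then stripping the tzinfo, so the stored value stays naive and matches the existing naive-UTC timestamps in the module. A standalone illustration (not from the repo):

```python
from datetime import datetime, timedelta, timezone

# Aware "now" in UTC, pushed one minute into the future...
aware = datetime.now(timezone.utc) + timedelta(minutes=1)

# ...then made naive so it compares cleanly against existing naive-UTC timestamps.
# Equivalent to the deprecated `datetime.utcnow() + timedelta(minutes=1)`.
naive_utc = aware.replace(tzinfo=None)

print(naive_utc.isoformat())
```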

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal_extraction.py

Lines changed: 56 additions & 0 deletions
```diff
@@ -386,6 +386,62 @@ def test_test_track_mapping_failure_then_success(self):
         self.assertEqual(test_sig.commits[0].events[0].status, SignalStatus.FAILURE)
         self.assertEqual(test_sig.commits[1].events[0].status, SignalStatus.SUCCESS)

+    def test_inject_pending_workflow_event_when_missing_in_signal(self):
+        # Multi-stage workflow: newest commit has a pending workflow run (build stage),
+        # tests not yet scheduled -> no events for that wf_run_id in the test signal.
+        # Older commit has a test failure so the test signal exists.
+        jobs = [
+            # Newest commit: pending build job under wf_run_id=200
+            J(
+                sha="H2",
+                wf="trunk",
+                run=200,
+                job=901,
+                attempt=1,
+                name="linux-build",
+                status="in_progress",
+                conclusion="",
+                started_at=ts(self.t0, 20),
+            ),
+            # Older commit: test job that failed with a concrete test verdict
+            J(
+                sha="H1",
+                wf="trunk",
+                run=190,
+                job=902,
+                attempt=1,
+                name="linux-test",
+                status="completed",
+                conclusion="failure",
+                started_at=ts(self.t0, 10),
+                rule="pytest failure",
+            ),
+        ]
+        tests = [
+            T(
+                job=902,
+                run=190,
+                attempt=1,
+                file="m.py",
+                name="test_synthetic_pending",
+                failing=1,
+            )
+        ]
+
+        signals = self._extract(jobs, tests)
+        test_sig = self._find_test_signal(
+            signals, "trunk", "m.py::test_synthetic_pending"
+        )
+        self.assertIsNotNone(test_sig)
+        # Expect two commits in newest->older order
+        self.assertEqual([c.head_sha for c in test_sig.commits], ["H2", "H1"])
+
+        # For the newest commit (H2): we should have a synthetic pending event for wf_run_id=200
+        c_new = test_sig.commits[0]
+        self.assertEqual(len(c_new.events), 1)
+        self.assertEqual(c_new.events[0].status, SignalStatus.PENDING)
+        self.assertEqual(c_new.events[0].wf_run_id, 200)
+

 if __name__ == "__main__":
     unittest.main()
```
