From 2c528a298a87b7158842daa38edf99cc57c46b63 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Fri, 11 Jul 2025 08:04:34 -0700 Subject: [PATCH 1/7] script updates --- tools/analytics/github_analyze.py | 131 +++++++++++++++++++++++++----- 1 file changed, 109 insertions(+), 22 deletions(-) diff --git a/tools/analytics/github_analyze.py b/tools/analytics/github_analyze.py index c1eac057b1..48a296cb87 100644 --- a/tools/analytics/github_analyze.py +++ b/tools/analytics/github_analyze.py @@ -100,7 +100,7 @@ def get_ghf_revert_revision(commit: GitCommit) -> Optional[str]: rc is not None, ] ): - return rc.group(1) + return rc.group(1) if rc else None return None @@ -553,48 +553,96 @@ def extract_commit_hash_from_revert(text): Returns: str or None: The extracted commit hash, or None if not found """ - # Pattern to match "This reverts commit ." - pattern = r"This reverts commit ([0-9a-f]+)\." - - match = re.search(pattern, text) - if match: - return match.group(1) + # Enhanced patterns to match various PyTorch revert formats + patterns = [ + r"This reverts commit ([0-9a-f]{40})\.", # Full 40-char hash + r"This reverts commit ([0-9a-f]{7,40})\.", # Variable length hash + r"reverts commit ([0-9a-f]{7,40})", # Case insensitive + r"Revert.*commit ([0-9a-f]{7,40})", # Flexible revert format + r"Back out.*([0-9a-f]{7,40})", # Back out format + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return match.group(1) return None -def analyze_reverts_missing_from_branch(repo: GitRepo, branch: str) -> None: +def analyze_reverts_missing_from_branch( + repo: GitRepo, branch: str, fork_cut_date: Optional[datetime] = None +) -> None: """ Analyze reverts applied to main branch but not applied to specified branch. This identifies potential missing revert commits that may need to be cherry-picked - to the release branch. Also detects if reverted commits from main were cherry-picked - to the branch. + to maintain parity with main. Enhanced to validate fork branch contents and timing. + + Args: + repo: GitRepo instance + branch: Target branch to analyze (e.g., release branch) + fork_cut_date: Optional date when fork was created from main """ # Get commits from main that are not in the specified branch main_only_commits = build_commit_dict(repo.get_commit_list(branch, "main")) # Get commits from the specified branch that are not in main branch_only_commits = build_commit_dict(repo.get_commit_list("main", branch)) + + # Get all commits in the branch to check original PR presence + all_branch_commits = build_commit_dict(repo._run_git_log(f"orig/{branch}")) + branch_only_reverts = set() + branch_commit_hashes = set(all_branch_commits.keys()) print(f"Analyzing reverts in main branch not present in {branch} branch") + if fork_cut_date: + print(f"Filtering for reverts after fork cut date: {fork_cut_date}") print(f"Total commits in main but not in {branch}: {len(main_only_commits)}") print(f"Total commits in {branch} but not in main: {len(branch_only_commits)}") + print(f"Total commits in {branch}: {len(all_branch_commits)}") print() + # Build set of reverted commits in branch and reverts in branch for commit_hash, commit in branch_only_commits.items(): revert_hash = extract_commit_hash_from_revert(commit.body) - if revert_hash != None: + if revert_hash: branch_only_reverts.add(revert_hash) if is_revert(commit): branch_only_reverts.add(commit_hash) + # Also check all branch commits for existing reverts + for commit_hash, commit in all_branch_commits.items(): + revert_hash = extract_commit_hash_from_revert(commit.body) + if revert_hash: + branch_only_reverts.add(revert_hash) + # Find reverts in main that are not in the specified branch reverts_missing_from_branch = [] + reverts_needing_cherry_pick = [] for commit_hash, commit in main_only_commits.items(): if is_revert(commit): + # Apply fork cut date filter if specified + commit_date = commit.commit_date or commit.author_date + if fork_cut_date and commit_date < fork_cut_date: + continue + reverts_missing_from_branch.append(commit) + # Check if the original reverted commit exists in the branch + reverted_hash = extract_commit_hash_from_revert(commit.body) + if not reverted_hash: + reverted_hash = get_ghf_revert_revision(commit) + + if reverted_hash and reverted_hash in branch_commit_hashes: + reverts_needing_cherry_pick.append( + { + "revert_commit": commit, + "reverted_commit_hash": reverted_hash, + "reason": f"Original commit {reverted_hash[:8]} exists in {branch}", + } + ) + if not reverts_missing_from_branch: print(f"No reverts found in main branch that are missing from {branch} branch.") return @@ -602,14 +650,47 @@ def analyze_reverts_missing_from_branch(repo: GitRepo, branch: str) -> None: print( f"Found {len(reverts_missing_from_branch)} revert(s) in main branch not present in {branch} branch:" ) + print( + f"Of these, {len(reverts_needing_cherry_pick)} revert(s) need cherry-picking for parity:" + ) print("=" * 80) + # First show reverts that definitely need cherry-picking + if reverts_needing_cherry_pick: + print("🔴 REVERTS REQUIRING CHERRY-PICK (original commit exists in branch):") + print("=" * 80) + + for revert_info in reverts_needing_cherry_pick: + commit = revert_info["revert_commit"] + print(f"✅ CHERRY-PICK NEEDED: {commit.commit_hash}") + print(f" Command: git cherry-pick {commit.commit_hash}") + print(f" Date: {commit.commit_date or commit.author_date}") + print(f" Author: {commit.author}") + print(f" Title: {commit.title}") + print(f" Reverted: {revert_info['reverted_commit_hash']}") + print(f" Reason: {revert_info['reason']}") + if commit.pr_url: + print(f" PR URL: {commit.pr_url}") + print("-" * 80) + + print() + print("🟡 OTHER REVERTS (may not need cherry-picking):") + print("=" * 80) + + # Show remaining reverts (those not requiring cherry-pick) + already_shown = { + r["revert_commit"].commit_hash for r in reverts_needing_cherry_pick + } + for commit in reverts_missing_from_branch: + if commit.commit_hash in already_shown: + continue + # Try to identify what was reverted revert_revision = get_revert_revision(commit) ghf_revert_revision = get_ghf_revert_revision(commit) + reverted_commit_hash = extract_commit_hash_from_revert(commit.body) - reverted_commit_hash = None if revert_revision: print(f"Reverted Phabricator Diff: {revert_revision}") elif ghf_revert_revision: @@ -617,13 +698,9 @@ def analyze_reverts_missing_from_branch(repo: GitRepo, branch: str) -> None: reverted_commit_hash = ghf_revert_revision # Check if the reverted commit was cherry-picked to the branch - cherry_picked_to_branch = False - if reverted_commit_hash: - if reverted_commit_hash in branch_only_reverts: - cherry_picked_to_branch = True - print( - f"✅ DETECTED: The reverted commit {reverted_commit_hash} was cherry-picked to {branch}" - ) + cherry_picked_to_branch = ( + reverted_commit_hash and reverted_commit_hash in branch_commit_hashes + ) print(f"Commit Hash: {commit.commit_hash}") print(f"Author: {commit.author}") @@ -632,9 +709,13 @@ def analyze_reverts_missing_from_branch(repo: GitRepo, branch: str) -> None: if commit.pr_url: print(f"PR URL: {commit.pr_url}") - if not cherry_picked_to_branch: + if cherry_picked_to_branch and reverted_commit_hash: + print( + f"âš ī¸ WARNING: Original commit {reverted_commit_hash[:8]} exists in {branch} - consider cherry-picking this revert" + ) + else: print( - f"âš ī¸ STATUS: The reverted commit does not appear to be in {branch}, so this revert may not be needed." + f"â„šī¸ INFO: Original commit not found in {branch} - revert may not be needed" ) print( @@ -668,6 +749,11 @@ def parse_arguments(): action="store_true", help="Analyze reverts applied to main branch but not applied to specified branch", ) + parser.add_argument( + "--fork-cut-date", + type=lambda d: datetime.strptime(d, "%Y-%m-%d"), + help="Date when fork branch was cut from main (YYYY-MM-DD format). Only reverts after this date will be considered.", + ) parser.add_argument("--date", type=lambda d: datetime.strptime(d, "%Y-%m-%d")) parser.add_argument("--issue-num", type=int) return parser.parse_args() @@ -698,7 +784,8 @@ def main(): "Error: --branch argument is required for --analyze-missing-reverts-from-branch" ) return - analyze_reverts_missing_from_branch(repo, args.branch) + fork_cut_date = getattr(args, "fork_cut_date", None) + analyze_reverts_missing_from_branch(repo, args.branch, fork_cut_date) return # Use milestone idx or search it along milestone titles From e242540cb4511cc6ab0c71d57ff5355636a7b6a0 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Fri, 11 Jul 2025 08:08:46 -0700 Subject: [PATCH 2/7] updated script --- tools/analytics/github_analyze.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/analytics/github_analyze.py b/tools/analytics/github_analyze.py index 48a296cb87..ccf13d7bde 100644 --- a/tools/analytics/github_analyze.py +++ b/tools/analytics/github_analyze.py @@ -139,6 +139,7 @@ def parse_medium_format(lines: Union[str, List[str]]) -> GitCommit: author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())), title=lines[4].strip(), body="\n".join(lines[5:]), + pr_url="", ) @@ -589,7 +590,9 @@ def analyze_reverts_missing_from_branch( branch_only_commits = build_commit_dict(repo.get_commit_list("main", branch)) # Get all commits in the branch to check original PR presence - all_branch_commits = build_commit_dict(repo._run_git_log(f"orig/{branch}")) + all_branch_commits = build_commit_dict( + repo._run_git_log(f"{repo.remote}/orig/{branch}") + ) branch_only_reverts = set() branch_commit_hashes = set(all_branch_commits.keys()) From 0e588fba0a2ea0150b1256fe34549cbd731f8bfb Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Fri, 11 Jul 2025 08:13:22 -0700 Subject: [PATCH 3/7] update workflow --- .github/workflows/github-analytics-daily.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/github-analytics-daily.yml b/.github/workflows/github-analytics-daily.yml index e2b281c6ae..ce4c0aebce 100644 --- a/.github/workflows/github-analytics-daily.yml +++ b/.github/workflows/github-analytics-daily.yml @@ -79,4 +79,5 @@ jobs: --remote origin \ --branch "${BRANCH:-release/2.8}" \ --milestone-id "${MILESTONE:-53}" \ - --missing-in-branch + --fork-cut-date 2024-06-24 + --analyze-missing-reverts-from-branch From 41ed48270c2be76af869518869091d1d4be7f6d5 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Fri, 11 Jul 2025 08:42:05 -0700 Subject: [PATCH 4/7] making missing-in-branch command work --- .github/workflows/github-analytics-daily.yml | 2 +- tools/analytics/github_analyze.py | 251 ++++++++----------- 2 files changed, 107 insertions(+), 146 deletions(-) diff --git a/.github/workflows/github-analytics-daily.yml b/.github/workflows/github-analytics-daily.yml index ce4c0aebce..504de1bb96 100644 --- a/.github/workflows/github-analytics-daily.yml +++ b/.github/workflows/github-analytics-daily.yml @@ -79,5 +79,5 @@ jobs: --remote origin \ --branch "${BRANCH:-release/2.8}" \ --milestone-id "${MILESTONE:-53}" \ + --missing-in-branch --fork-cut-date 2024-06-24 - --analyze-missing-reverts-from-branch diff --git a/tools/analytics/github_analyze.py b/tools/analytics/github_analyze.py index ccf13d7bde..f984b277d6 100644 --- a/tools/analytics/github_analyze.py +++ b/tools/analytics/github_analyze.py @@ -451,127 +451,11 @@ def print_contributor_stats(commits, delta: Optional[timedelta] = None) -> None: def commits_missing_in_branch( - repo: GitRepo, branch: str, orig_branch: str, milestone_idx: int -) -> None: - def get_commits_dict(x, y): - return build_commit_dict(repo.get_commit_list(x, y)) - - main_commits = get_commits_dict(orig_branch, "main") - release_commits = get_commits_dict(orig_branch, branch) - print(f"len(main_commits)={len(main_commits)}") - print(f"len(release_commits)={len(release_commits)}") - print("URL;Title;Status") - for issue in gh_get_milestone_issues( - "pytorch", "pytorch", milestone_idx, IssueState.ALL - ): - issue_url, state = issue["html_url"], issue["state"] - # Skip closed states if they were landed before merge date - if state == "closed": - mentioned_after_cut = any( - commit.is_issue_mentioned(issue_url) for commit in main_commits.values() - ) - # If issue is not mentioned after cut, that it must be present in release branch - if not mentioned_after_cut: - continue - mentioned_in_release = any( - commit.is_issue_mentioned(issue_url) - for commit in release_commits.values() - ) - # if Issue is mentioned is release branch, than it was picked already - if mentioned_in_release: - continue - print(f'{issue_url};{issue["title"]};{state}') - - -def commits_missing_in_release( repo: GitRepo, branch: str, orig_branch: str, - minor_release: str, milestone_idx: int, - cut_off_date: datetime, - issue_num: int, -) -> None: - def get_commits_dict(x, y): - return build_commit_dict(repo.get_commit_list(x, y)) - - main_commits = get_commits_dict(minor_release, "main") - prev_release_commits = get_commits_dict(orig_branch, branch) - current_issue_comments = get_issue_comments( - "pytorch", "pytorch", issue_num - ) # issue comments for the release tracker as cherry picks - print(f"len(main_commits)={len(main_commits)}") - print(f"len(prev_release_commits)={len(prev_release_commits)}") - print(f"len(current_issue_comments)={len(current_issue_comments)}") - print(f"issue_num: {issue_num}, len(issue_comments)={len(current_issue_comments)}") - print("URL;Title;Status") - - # Iterate over the previous release branch to find potentially missing cherry picks in the current issue. - for commit in prev_release_commits.values(): - not_cherry_picked_in_current_issue = any( - commit.pr_url not in issue_comment["body"] - for issue_comment in current_issue_comments - ) - for main_commit in main_commits.values(): - if main_commit.pr_url == commit.pr_url: - mentioned_after_cut_off_date = cut_off_date < main_commit.commit_date - if not_cherry_picked_in_current_issue and mentioned_after_cut_off_date: - # Commits that are release only, which exist in previous release branch and not in main. - print(f"{commit.pr_url};{commit.title};{commit.commit_date}") - break - - -def analyze_stacks(repo: GitRepo) -> None: - from tqdm.contrib.concurrent import thread_map - - branches = repo.get_ghstack_orig_branches() - stacks_by_author: Dict[str, List[int]] = {} - for branch, rv_commits in thread_map( - lambda x: (x, repo.rev_list(x)), branches, max_workers=10 - ): - author = branch.split("/")[2] - if author not in stacks_by_author: - stacks_by_author[author] = [] - stacks_by_author[author].append(len(rv_commits)) - for author, slen in sorted( - stacks_by_author.items(), key=lambda x: len(x[1]), reverse=True - ): - if len(slen) == 1: - print(f"{author} has 1 stack of depth {slen[0]}") - continue - print( - f"{author} has {len(slen)} stacks max depth is {max(slen)} avg depth is {sum(slen)/len(slen):.2f} mean is {slen[len(slen)//2]}" - ) - - -def extract_commit_hash_from_revert(text): - """ - Extract commit hash from a revert commit message. - - Args: - text (str): The revert commit message - - Returns: - str or None: The extracted commit hash, or None if not found - """ - # Enhanced patterns to match various PyTorch revert formats - patterns = [ - r"This reverts commit ([0-9a-f]{40})\.", # Full 40-char hash - r"This reverts commit ([0-9a-f]{7,40})\.", # Variable length hash - r"reverts commit ([0-9a-f]{7,40})", # Case insensitive - r"Revert.*commit ([0-9a-f]{7,40})", # Flexible revert format - r"Back out.*([0-9a-f]{7,40})", # Back out format - ] - - for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE) - if match: - return match.group(1) - return None - - -def analyze_reverts_missing_from_branch( - repo: GitRepo, branch: str, fork_cut_date: Optional[datetime] = None + fork_cut_date: Optional[datetime] = None, ) -> None: """ Analyze reverts applied to main branch but not applied to specified branch. @@ -581,6 +465,8 @@ def analyze_reverts_missing_from_branch( Args: repo: GitRepo instance branch: Target branch to analyze (e.g., release branch) + orig_branch: Original branch reference (unused in new implementation) + milestone_idx: Milestone index (unused in new implementation) fork_cut_date: Optional date when fork was created from main """ # Get commits from main that are not in the specified branch @@ -727,6 +613,93 @@ def analyze_reverts_missing_from_branch( print("-" * 80) +def commits_missing_in_release( + repo: GitRepo, + branch: str, + orig_branch: str, + minor_release: str, + milestone_idx: int, + cut_off_date: datetime, + issue_num: int, +) -> None: + def get_commits_dict(x, y): + return build_commit_dict(repo.get_commit_list(x, y)) + + main_commits = get_commits_dict(minor_release, "main") + prev_release_commits = get_commits_dict(orig_branch, branch) + current_issue_comments = get_issue_comments( + "pytorch", "pytorch", issue_num + ) # issue comments for the release tracker as cherry picks + print(f"len(main_commits)={len(main_commits)}") + print(f"len(prev_release_commits)={len(prev_release_commits)}") + print(f"len(current_issue_comments)={len(current_issue_comments)}") + print(f"issue_num: {issue_num}, len(issue_comments)={len(current_issue_comments)}") + print("URL;Title;Status") + + # Iterate over the previous release branch to find potentially missing cherry picks in the current issue. + for commit in prev_release_commits.values(): + not_cherry_picked_in_current_issue = any( + commit.pr_url not in issue_comment["body"] + for issue_comment in current_issue_comments + ) + for main_commit in main_commits.values(): + if main_commit.pr_url == commit.pr_url: + mentioned_after_cut_off_date = cut_off_date < main_commit.commit_date + if not_cherry_picked_in_current_issue and mentioned_after_cut_off_date: + # Commits that are release only, which exist in previous release branch and not in main. + print(f"{commit.pr_url};{commit.title};{commit.commit_date}") + break + + +def analyze_stacks(repo: GitRepo) -> None: + from tqdm.contrib.concurrent import thread_map + + branches = repo.get_ghstack_orig_branches() + stacks_by_author: Dict[str, List[int]] = {} + for branch, rv_commits in thread_map( + lambda x: (x, repo.rev_list(x)), branches, max_workers=10 + ): + author = branch.split("/")[2] + if author not in stacks_by_author: + stacks_by_author[author] = [] + stacks_by_author[author].append(len(rv_commits)) + for author, slen in sorted( + stacks_by_author.items(), key=lambda x: len(x[1]), reverse=True + ): + if len(slen) == 1: + print(f"{author} has 1 stack of depth {slen[0]}") + continue + print( + f"{author} has {len(slen)} stacks max depth is {max(slen)} avg depth is {sum(slen)/len(slen):.2f} mean is {slen[len(slen)//2]}" + ) + + +def extract_commit_hash_from_revert(text): + """ + Extract commit hash from a revert commit message. + + Args: + text (str): The revert commit message + + Returns: + str or None: The extracted commit hash, or None if not found + """ + # Enhanced patterns to match various PyTorch revert formats + patterns = [ + r"This reverts commit ([0-9a-f]{40})\.", # Full 40-char hash + r"This reverts commit ([0-9a-f]{7,40})\.", # Variable length hash + r"reverts commit ([0-9a-f]{7,40})", # Case insensitive + r"Revert.*commit ([0-9a-f]{7,40})", # Flexible revert format + r"Back out.*([0-9a-f]{7,40})", # Back out format + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return match.group(1) + return None + + def parse_arguments(): from argparse import ArgumentParser @@ -747,11 +720,6 @@ def parse_arguments(): parser.add_argument("--missing-in-branch", action="store_true") parser.add_argument("--missing-in-release", action="store_true") parser.add_argument("--analyze-stacks", action="store_true") - parser.add_argument( - "--analyze-missing-reverts-from-branch", - action="store_true", - help="Analyze reverts applied to main branch but not applied to specified branch", - ) parser.add_argument( "--fork-cut-date", type=lambda d: datetime.strptime(d, "%Y-%m-%d"), @@ -781,32 +749,25 @@ def main(): analyze_stacks(repo) return - if args.analyze_missing_reverts_from_branch: - if not args.branch: - print( - "Error: --branch argument is required for --analyze-missing-reverts-from-branch" - ) - return - fork_cut_date = getattr(args, "fork_cut_date", None) - analyze_reverts_missing_from_branch(repo, args.branch, fork_cut_date) - return - - # Use milestone idx or search it along milestone titles - try: - milestone_idx = int(args.milestone_id) - except ValueError: - milestone_idx = -1 - milestones = gh_get_milestones() - for milestone in milestones: - if milestone.get("title", "") == args.milestone_id: - milestone_idx = int(milestone.get("number", "-2")) - if milestone_idx < 0: - print(f"Could not find milestone {args.milestone_id}") - return + # Use milestone idx or search it along milestone titles (still needed for other functions) + milestone_idx = 0 # Default value for missing_in_branch revert analysis + if args.milestone_id: + try: + milestone_idx = int(args.milestone_id) + except ValueError: + milestone_idx = -1 + milestones = gh_get_milestones() + for milestone in milestones: + if milestone.get("title", "") == args.milestone_id: + milestone_idx = int(milestone.get("number", "-2")) + if milestone_idx < 0: + print(f"Could not find milestone {args.milestone_id}") + return if args.missing_in_branch: + fork_cut_date = getattr(args, "fork_cut_date", None) commits_missing_in_branch( - repo, args.branch, f"orig/{args.branch}", milestone_idx + repo, args.branch, f"orig/{args.branch}", milestone_idx, fork_cut_date ) return From 2093c7d91221dd66373fcbce39f26cff28612b08 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Fri, 11 Jul 2025 09:37:46 -0700 Subject: [PATCH 5/7] restore original function --- .github/workflows/github-analytics-daily.yml | 1 + tools/analytics/github_analyze.py | 251 +++++++++++-------- 2 files changed, 146 insertions(+), 106 deletions(-) diff --git a/.github/workflows/github-analytics-daily.yml b/.github/workflows/github-analytics-daily.yml index 504de1bb96..2d1ac735b8 100644 --- a/.github/workflows/github-analytics-daily.yml +++ b/.github/workflows/github-analytics-daily.yml @@ -81,3 +81,4 @@ jobs: --milestone-id "${MILESTONE:-53}" \ --missing-in-branch --fork-cut-date 2024-06-24 + --analyze-missing-reverts-from-branch diff --git a/tools/analytics/github_analyze.py b/tools/analytics/github_analyze.py index f984b277d6..ccf13d7bde 100644 --- a/tools/analytics/github_analyze.py +++ b/tools/analytics/github_analyze.py @@ -451,11 +451,127 @@ def print_contributor_stats(commits, delta: Optional[timedelta] = None) -> None: def commits_missing_in_branch( + repo: GitRepo, branch: str, orig_branch: str, milestone_idx: int +) -> None: + def get_commits_dict(x, y): + return build_commit_dict(repo.get_commit_list(x, y)) + + main_commits = get_commits_dict(orig_branch, "main") + release_commits = get_commits_dict(orig_branch, branch) + print(f"len(main_commits)={len(main_commits)}") + print(f"len(release_commits)={len(release_commits)}") + print("URL;Title;Status") + for issue in gh_get_milestone_issues( + "pytorch", "pytorch", milestone_idx, IssueState.ALL + ): + issue_url, state = issue["html_url"], issue["state"] + # Skip closed states if they were landed before merge date + if state == "closed": + mentioned_after_cut = any( + commit.is_issue_mentioned(issue_url) for commit in main_commits.values() + ) + # If issue is not mentioned after cut, that it must be present in release branch + if not mentioned_after_cut: + continue + mentioned_in_release = any( + commit.is_issue_mentioned(issue_url) + for commit in release_commits.values() + ) + # if Issue is mentioned is release branch, than it was picked already + if mentioned_in_release: + continue + print(f'{issue_url};{issue["title"]};{state}') + + +def commits_missing_in_release( repo: GitRepo, branch: str, orig_branch: str, + minor_release: str, milestone_idx: int, - fork_cut_date: Optional[datetime] = None, + cut_off_date: datetime, + issue_num: int, +) -> None: + def get_commits_dict(x, y): + return build_commit_dict(repo.get_commit_list(x, y)) + + main_commits = get_commits_dict(minor_release, "main") + prev_release_commits = get_commits_dict(orig_branch, branch) + current_issue_comments = get_issue_comments( + "pytorch", "pytorch", issue_num + ) # issue comments for the release tracker as cherry picks + print(f"len(main_commits)={len(main_commits)}") + print(f"len(prev_release_commits)={len(prev_release_commits)}") + print(f"len(current_issue_comments)={len(current_issue_comments)}") + print(f"issue_num: {issue_num}, len(issue_comments)={len(current_issue_comments)}") + print("URL;Title;Status") + + # Iterate over the previous release branch to find potentially missing cherry picks in the current issue. + for commit in prev_release_commits.values(): + not_cherry_picked_in_current_issue = any( + commit.pr_url not in issue_comment["body"] + for issue_comment in current_issue_comments + ) + for main_commit in main_commits.values(): + if main_commit.pr_url == commit.pr_url: + mentioned_after_cut_off_date = cut_off_date < main_commit.commit_date + if not_cherry_picked_in_current_issue and mentioned_after_cut_off_date: + # Commits that are release only, which exist in previous release branch and not in main. + print(f"{commit.pr_url};{commit.title};{commit.commit_date}") + break + + +def analyze_stacks(repo: GitRepo) -> None: + from tqdm.contrib.concurrent import thread_map + + branches = repo.get_ghstack_orig_branches() + stacks_by_author: Dict[str, List[int]] = {} + for branch, rv_commits in thread_map( + lambda x: (x, repo.rev_list(x)), branches, max_workers=10 + ): + author = branch.split("/")[2] + if author not in stacks_by_author: + stacks_by_author[author] = [] + stacks_by_author[author].append(len(rv_commits)) + for author, slen in sorted( + stacks_by_author.items(), key=lambda x: len(x[1]), reverse=True + ): + if len(slen) == 1: + print(f"{author} has 1 stack of depth {slen[0]}") + continue + print( + f"{author} has {len(slen)} stacks max depth is {max(slen)} avg depth is {sum(slen)/len(slen):.2f} mean is {slen[len(slen)//2]}" + ) + + +def extract_commit_hash_from_revert(text): + """ + Extract commit hash from a revert commit message. + + Args: + text (str): The revert commit message + + Returns: + str or None: The extracted commit hash, or None if not found + """ + # Enhanced patterns to match various PyTorch revert formats + patterns = [ + r"This reverts commit ([0-9a-f]{40})\.", # Full 40-char hash + r"This reverts commit ([0-9a-f]{7,40})\.", # Variable length hash + r"reverts commit ([0-9a-f]{7,40})", # Case insensitive + r"Revert.*commit ([0-9a-f]{7,40})", # Flexible revert format + r"Back out.*([0-9a-f]{7,40})", # Back out format + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return match.group(1) + return None + + +def analyze_reverts_missing_from_branch( + repo: GitRepo, branch: str, fork_cut_date: Optional[datetime] = None ) -> None: """ Analyze reverts applied to main branch but not applied to specified branch. @@ -465,8 +581,6 @@ def commits_missing_in_branch( Args: repo: GitRepo instance branch: Target branch to analyze (e.g., release branch) - orig_branch: Original branch reference (unused in new implementation) - milestone_idx: Milestone index (unused in new implementation) fork_cut_date: Optional date when fork was created from main """ # Get commits from main that are not in the specified branch @@ -613,93 +727,6 @@ def commits_missing_in_branch( print("-" * 80) -def commits_missing_in_release( - repo: GitRepo, - branch: str, - orig_branch: str, - minor_release: str, - milestone_idx: int, - cut_off_date: datetime, - issue_num: int, -) -> None: - def get_commits_dict(x, y): - return build_commit_dict(repo.get_commit_list(x, y)) - - main_commits = get_commits_dict(minor_release, "main") - prev_release_commits = get_commits_dict(orig_branch, branch) - current_issue_comments = get_issue_comments( - "pytorch", "pytorch", issue_num - ) # issue comments for the release tracker as cherry picks - print(f"len(main_commits)={len(main_commits)}") - print(f"len(prev_release_commits)={len(prev_release_commits)}") - print(f"len(current_issue_comments)={len(current_issue_comments)}") - print(f"issue_num: {issue_num}, len(issue_comments)={len(current_issue_comments)}") - print("URL;Title;Status") - - # Iterate over the previous release branch to find potentially missing cherry picks in the current issue. - for commit in prev_release_commits.values(): - not_cherry_picked_in_current_issue = any( - commit.pr_url not in issue_comment["body"] - for issue_comment in current_issue_comments - ) - for main_commit in main_commits.values(): - if main_commit.pr_url == commit.pr_url: - mentioned_after_cut_off_date = cut_off_date < main_commit.commit_date - if not_cherry_picked_in_current_issue and mentioned_after_cut_off_date: - # Commits that are release only, which exist in previous release branch and not in main. - print(f"{commit.pr_url};{commit.title};{commit.commit_date}") - break - - -def analyze_stacks(repo: GitRepo) -> None: - from tqdm.contrib.concurrent import thread_map - - branches = repo.get_ghstack_orig_branches() - stacks_by_author: Dict[str, List[int]] = {} - for branch, rv_commits in thread_map( - lambda x: (x, repo.rev_list(x)), branches, max_workers=10 - ): - author = branch.split("/")[2] - if author not in stacks_by_author: - stacks_by_author[author] = [] - stacks_by_author[author].append(len(rv_commits)) - for author, slen in sorted( - stacks_by_author.items(), key=lambda x: len(x[1]), reverse=True - ): - if len(slen) == 1: - print(f"{author} has 1 stack of depth {slen[0]}") - continue - print( - f"{author} has {len(slen)} stacks max depth is {max(slen)} avg depth is {sum(slen)/len(slen):.2f} mean is {slen[len(slen)//2]}" - ) - - -def extract_commit_hash_from_revert(text): - """ - Extract commit hash from a revert commit message. - - Args: - text (str): The revert commit message - - Returns: - str or None: The extracted commit hash, or None if not found - """ - # Enhanced patterns to match various PyTorch revert formats - patterns = [ - r"This reverts commit ([0-9a-f]{40})\.", # Full 40-char hash - r"This reverts commit ([0-9a-f]{7,40})\.", # Variable length hash - r"reverts commit ([0-9a-f]{7,40})", # Case insensitive - r"Revert.*commit ([0-9a-f]{7,40})", # Flexible revert format - r"Back out.*([0-9a-f]{7,40})", # Back out format - ] - - for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE) - if match: - return match.group(1) - return None - - def parse_arguments(): from argparse import ArgumentParser @@ -720,6 +747,11 @@ def parse_arguments(): parser.add_argument("--missing-in-branch", action="store_true") parser.add_argument("--missing-in-release", action="store_true") parser.add_argument("--analyze-stacks", action="store_true") + parser.add_argument( + "--analyze-missing-reverts-from-branch", + action="store_true", + help="Analyze reverts applied to main branch but not applied to specified branch", + ) parser.add_argument( "--fork-cut-date", type=lambda d: datetime.strptime(d, "%Y-%m-%d"), @@ -749,25 +781,32 @@ def main(): analyze_stacks(repo) return - # Use milestone idx or search it along milestone titles (still needed for other functions) - milestone_idx = 0 # Default value for missing_in_branch revert analysis - if args.milestone_id: - try: - milestone_idx = int(args.milestone_id) - except ValueError: - milestone_idx = -1 - milestones = gh_get_milestones() - for milestone in milestones: - if milestone.get("title", "") == args.milestone_id: - milestone_idx = int(milestone.get("number", "-2")) - if milestone_idx < 0: - print(f"Could not find milestone {args.milestone_id}") - return + if args.analyze_missing_reverts_from_branch: + if not args.branch: + print( + "Error: --branch argument is required for --analyze-missing-reverts-from-branch" + ) + return + fork_cut_date = getattr(args, "fork_cut_date", None) + analyze_reverts_missing_from_branch(repo, args.branch, fork_cut_date) + return + + # Use milestone idx or search it along milestone titles + try: + milestone_idx = int(args.milestone_id) + except ValueError: + milestone_idx = -1 + milestones = gh_get_milestones() + for milestone in milestones: + if milestone.get("title", "") == args.milestone_id: + milestone_idx = int(milestone.get("number", "-2")) + if milestone_idx < 0: + print(f"Could not find milestone {args.milestone_id}") + return if args.missing_in_branch: - fork_cut_date = getattr(args, "fork_cut_date", None) commits_missing_in_branch( - repo, args.branch, f"orig/{args.branch}", milestone_idx, fork_cut_date + repo, args.branch, f"orig/{args.branch}", milestone_idx ) return From 50c31e1c86f02bb842e5a8d8f595006441f09e9d Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Fri, 11 Jul 2025 09:53:38 -0700 Subject: [PATCH 6/7] remove missing in branch --- .github/workflows/github-analytics-daily.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/github-analytics-daily.yml b/.github/workflows/github-analytics-daily.yml index 2d1ac735b8..ce4c0aebce 100644 --- a/.github/workflows/github-analytics-daily.yml +++ b/.github/workflows/github-analytics-daily.yml @@ -79,6 +79,5 @@ jobs: --remote origin \ --branch "${BRANCH:-release/2.8}" \ --milestone-id "${MILESTONE:-53}" \ - --missing-in-branch --fork-cut-date 2024-06-24 --analyze-missing-reverts-from-branch From 2074b47f6fd7ea51909aefd476e7faaa8cb24995 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Fri, 11 Jul 2025 10:04:19 -0700 Subject: [PATCH 7/7] fix --- .github/workflows/github-analytics-daily.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/github-analytics-daily.yml b/.github/workflows/github-analytics-daily.yml index ce4c0aebce..501189fa39 100644 --- a/.github/workflows/github-analytics-daily.yml +++ b/.github/workflows/github-analytics-daily.yml @@ -68,6 +68,8 @@ jobs: --remote origin \ --branch "${BRANCH:-release/2.8}" \ --analyze-missing-reverts-from-branch + --fork-cut-date 2024-06-24 + - name: Show outstanding milestone issues env: @@ -79,5 +81,3 @@ jobs: --remote origin \ --branch "${BRANCH:-release/2.8}" \ --milestone-id "${MILESTONE:-53}" \ - --fork-cut-date 2024-06-24 - --analyze-missing-reverts-from-branch