4545import re
4646import subprocess
4747import sys
48- from typing import Generator , List , NamedTuple , Iterable
48+ from typing import Dict , Generator , Iterable , List , NamedTuple , Optional , Tuple
4949
5050assert sys .version_info >= (3 , 6 ), "Only Python 3.6+ is supported."
5151
5252# People are creative with their reverts, and heuristics are a bit difficult.
53- # Like 90% of of reverts have "This reverts commit ${full_sha}".
54- # Some lack that entirely, while others have many of them specified in ad-hoc
55- # ways, while others use short SHAs and whatever.
53+ # At a glance, most reverts have "This reverts commit ${full_sha}". Many others
54+ # have `Reverts llvm/llvm-project#${PR_NUMBER}`.
5655#
57- # The 90% case is trivial to handle (and 100% free + automatic). The extra 10%
58- # starts involving human intervention, which is probably not worth it for now.
56+ # By their powers combined, we should be able to automatically catch something
57+ # like 80% of reverts with reasonable confidence. At some point, human
58+ # intervention will always be required (e.g., I saw
59+ # ```
60+ # This reverts commit ${commit_sha_1} and
61+ # also ${commit_sha_2_shorthand}
62+ # ```
63+ # during my sample)
64+
65+ _CommitMessageReverts = NamedTuple (
66+ "_CommitMessageReverts" ,
67+ [
68+ ("potential_shas" , List [str ]),
69+ ("potential_pr_numbers" , List [int ]),
70+ ],
71+ )
72+
5973
74+ def _try_parse_reverts_from_commit_message (
75+ commit_message : str ,
76+ ) -> _CommitMessageReverts :
77+ """Tries to parse revert SHAs and LLVM PR numbers form the commit message.
6078
61- def _try_parse_reverts_from_commit_message (commit_message : str ) -> List [str ]:
79+ Returns:
80+ A namedtuple containing:
81+ - A list of potentially reverted SHAs
82+ - A list of potentially reverted LLVM PR numbers
83+ """
6284 if not commit_message :
63- return []
85+ return _CommitMessageReverts ([], [])
6486
65- results = re .findall (r"This reverts commit ([a-f0-9]{40})\b" , commit_message )
87+ sha_reverts = re .findall (
88+ r"This reverts commit ([a-f0-9]{40})\b" ,
89+ commit_message ,
90+ )
6691
6792 first_line = commit_message .splitlines ()[0 ]
6893 initial_revert = re .match (r'Revert ([a-f0-9]{6,}) "' , first_line )
6994 if initial_revert :
70- results .append (initial_revert .group (1 ))
71- return results
95+ sha_reverts .append (initial_revert .group (1 ))
7296
97+ pr_numbers = [
98+ int (x )
99+ for x in re .findall (
100+ r"Reverts llvm/llvm-project#(\d+)" ,
101+ commit_message ,
102+ )
103+ ]
104+
105+ return _CommitMessageReverts (
106+ potential_shas = sha_reverts ,
107+ potential_pr_numbers = pr_numbers ,
108+ )
73109
74- def _stream_stdout (command : List [str ]) -> Generator [str , None , None ]:
110+
111+ def _stream_stdout (
112+ command : List [str ], cwd : Optional [str ] = None
113+ ) -> Generator [str , None , None ]:
75114 with subprocess .Popen (
76- command , stdout = subprocess .PIPE , encoding = "utf-8" , errors = "replace"
115+ command ,
116+ cwd = cwd ,
117+ stdout = subprocess .PIPE ,
118+ encoding = "utf-8" ,
119+ errors = "replace" ,
77120 ) as p :
78121 assert p .stdout is not None # for mypy's happiness.
79122 yield from p .stdout
@@ -175,10 +218,43 @@ def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
175218 ).strip ()
176219
177220
178- def find_reverts (git_dir : str , across_ref : str , root : str ) -> List [Revert ]:
221+ def _load_pr_commit_mappings (
222+ git_dir : str , root : str , min_ref : str
223+ ) -> Dict [int , List [str ]]:
224+ git_log = ["git" , "log" , "--format=%H %s" , f"{ min_ref } ..{ root } " ]
225+ results = collections .defaultdict (list )
226+ pr_regex = re .compile (r"\s\(#(\d+)\)$" )
227+ for line in _stream_stdout (git_log , cwd = git_dir ):
228+ m = pr_regex .search (line )
229+ if not m :
230+ continue
231+
232+ pr_number = int (m .group (1 ))
233+ sha = line .split (None , 1 )[0 ]
234+ # N.B., these are kept in log (read: reverse chronological) order,
235+ # which is what's expected by `find_reverts`.
236+ results [pr_number ].append (sha )
237+ return results
238+
239+
240+ # N.B., max_pr_lookback's default of 20K commits is arbitrary, but should be
241+ # enough for the 99% case of reverts: rarely should someone land a cleanish
242+ # revert of a >6 month old change...
243+ def find_reverts (
244+ git_dir : str , across_ref : str , root : str , max_pr_lookback : int = 20000
245+ ) -> List [Revert ]:
179246 """Finds reverts across `across_ref` in `git_dir`, starting from `root`.
180247
181248 These reverts are returned in order of oldest reverts first.
249+
250+ Args:
251+ git_dir: git directory to find reverts in.
252+ across_ref: the ref to find reverts across.
253+ root: the 'main' ref to look for reverts on.
254+ max_pr_lookback: this function uses heuristics to map PR numbers to
255+ SHAs. These heuristics require that commit history from `root` to
256+ `some_parent_of_root` is loaded in memory. `max_pr_lookback` is how
257+ many commits behind `across_ref` should be loaded in memory.
182258 """
183259 across_sha = _rev_parse (git_dir , across_ref )
184260 root_sha = _rev_parse (git_dir , root )
@@ -201,8 +277,41 @@ def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
201277 )
202278
203279 all_reverts = []
280+ # Lazily load PR <-> commit mappings, since it can be expensive.
281+ pr_commit_mappings = None
204282 for sha , commit_message in _log_stream (git_dir , root_sha , across_sha ):
205- reverts = _try_parse_reverts_from_commit_message (commit_message )
283+ reverts , pr_reverts = _try_parse_reverts_from_commit_message (
284+ commit_message ,
285+ )
286+ if pr_reverts :
287+ if pr_commit_mappings is None :
288+ logging .info (
289+ "Loading PR <-> commit mappings. This may take a moment..."
290+ )
291+ pr_commit_mappings = _load_pr_commit_mappings (
292+ git_dir , root_sha , f"{ across_sha } ~{ max_pr_lookback } "
293+ )
294+ logging .info (
295+ "Loaded %d PR <-> commit mappings" , len (pr_commit_mappings )
296+ )
297+
298+ for reverted_pr_number in pr_reverts :
299+ reverted_shas = pr_commit_mappings .get (reverted_pr_number )
300+ if not reverted_shas :
301+ logging .warning (
302+ "No SHAs for reverted PR %d (commit %s)" ,
303+ reverted_pr_number ,
304+ sha ,
305+ )
306+ continue
307+ logging .debug (
308+ "Inferred SHAs %s for reverted PR %d (commit %s)" ,
309+ reverted_shas ,
310+ reverted_pr_number ,
311+ sha ,
312+ )
313+ reverts .extend (reverted_shas )
314+
206315 if not reverts :
207316 continue
208317
0 commit comments