Skip to content

Commit 1b4a424

Browse files
committed
Add temporary LLVM CI crawling scripts
1 parent 26f0894 commit 1b4a424

File tree

4 files changed

+482
-11
lines changed

4 files changed

+482
-11
lines changed

filter_candidates.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import argparse
2+
import collections
3+
import json
4+
import tempfile
5+
import os
6+
import subprocess
7+
8+
9+
def main(args: argparse.Namespace):
    """Keep only diffs whose base commit builds but the diff itself does not.

    Reads ``(base, diff)`` pairs from ``args.json_file`` and build results
    from ``args.log_file``, then writes ``{base: [diff, ...]}`` — containing
    only the diffs that plausibly introduced a build failure — as JSON to
    ``args.output_log_file``.

    The log file is line oriented:
        <base-sha> <True|False>        unindented: does the base commit build?
          <diff-sha> <True|False>      indented: does base + diff build?
    Indented lines belong to the most recent unindented base line and appear
    in the same order as that base's diffs in the JSON file.
    """
    with open(args.json_file) as f:
        raw_data = json.load(f)
    # Re-group the flat (base, diff) pairs by base commit; diff order is
    # preserved so it lines up with the log file's indented result lines.
    data = collections.defaultdict(list)
    for base, diff in raw_data:
        data[base].append(diff)

    with open(args.log_file) as f:
        log_lines = f.read().splitlines()

    base_builds = {}  # base sha -> whether the base commit builds
    diff_builds = collections.defaultdict(list)  # base sha -> per-diff results

    i = 0
    while i < len(log_lines):
        # Validate explicitly rather than with `assert`, which is stripped
        # under `python -O` and would let a malformed log corrupt the parse.
        if log_lines[i].startswith(" "):
            raise ValueError(f"Expected an unindented base line, got: {log_lines[i]!r}")
        base, builds = log_lines[i].split()
        base_builds[base] = builds == "True"
        i += 1
        # Consume the indented block of per-diff results for this base.
        while i < len(log_lines) and log_lines[i].startswith(" "):
            _, diff_result = log_lines[i].strip().split()
            diff_builds[base].append(diff_result == "True")
            i += 1

    filtered = collections.defaultdict(list)

    for base, base_ok in base_builds.items():
        # Only diffs on top of a cleanly-building base are interesting...
        if not base_ok:
            continue

        # ...and only if the diff itself fails to build.  (Renamed from
        # `builds`, which shadowed the outer loop variable.)
        for diff_index, diff_ok in enumerate(diff_builds[base]):
            if diff_ok:
                continue

            # Optional interactive review of each candidate, disabled for now:
            # def display_diff() -> None:
            #     subprocess.Popen(
            #         ["python3", "-m", "ydiff"], text=True, stdin=subprocess.PIPE
            #     ).communicate(data[base][diff_index])

            # display_diff()
            # while answer := input("Does this seem reasonable? [y/n] ") not in [
            #     "y",
            #     "n",
            # ]:
            #     display_diff()

            # if answer == "n":
            #     continue

            filtered[base].append(data[base][diff_index])

    with open(args.output_log_file, "w") as f:
        json.dump(filtered, f)
63+
64+
65+
if __name__ == "__main__":
    # CLI entry point: wire up the three file-path options and hand off.
    cli = argparse.ArgumentParser()
    cli.add_argument("--json-file", type=str, default="commits.json")
    cli.add_argument("--log-file", type=str, default="commits.log")
    cli.add_argument("--output-log-file", type=str, default="commits-filtered.log")
    main(cli.parse_args())

github-llvm.py

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
import math
2+
import json
3+
import os
4+
import requests
5+
import subprocess
6+
import tempfile
7+
8+
# GitHub REST API root.
BASE = "https://api.github.com"
# Optional personal access token; unauthenticated requests are rate-limited
# much more aggressively by GitHub.
TOKEN = os.getenv("GITHUB_PAT", None)
# Headers sent with every API call; the Authorization header is only
# included when a token is actually available.
HEADERS = {
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28",
    **({"Authorization": f"Bearer {TOKEN}"} if TOKEN else {}),
}
15+
16+
17+
class LogGroup:
18+
def __init__(self, name: str | None = None):
19+
self.name = name
20+
self.content: list[str | LogGroup] = []
21+
22+
def append_line(self, line: str) -> None:
23+
self.content.append(line.strip())
24+
25+
def append_child(self, group: "LogGroup") -> None:
26+
self.content.append(group)
27+
28+
def __str__(self) -> str:
29+
def _str_helper(group: LogGroup, indent: int) -> str:
30+
result = ""
31+
indent_str = " " * indent
32+
result += f"{indent_str}Group: {group.name or '[unnamed]'}\n"
33+
for item in group.content:
34+
if isinstance(item, LogGroup):
35+
result += _str_helper(item, indent + 1)
36+
else:
37+
result += f"{indent_str} {item}\n"
38+
return result
39+
40+
return _str_helper(self, 0)
41+
42+
def get_child_group(self, name: str) -> "LogGroup | None":
43+
for item in self.content:
44+
if isinstance(item, LogGroup) and item.name == name:
45+
return item
46+
return None
47+
48+
def __iter__(self):
49+
for item in self.content:
50+
if isinstance(item, LogGroup):
51+
yield from item
52+
else:
53+
yield item
54+
55+
56+
def group_job_log(log: str) -> LogGroup:
    """Parse a GitHub Actions job log into a tree of LogGroups.

    Each line starts with a timestamp followed by a space; the remainder may
    be a ``##[group]<name>`` / ``##[endgroup]`` marker delimiting a nested
    section, or a plain log line.
    """

    def parse(lines: list[str], pos: int, into: LogGroup) -> int:
        # Consume lines into *into* until the matching ##[endgroup] (or end
        # of input); return the index of the first unconsumed line.
        while pos < len(lines):
            text = lines[pos]
            text = text[text.find(" ") + 1 :]  # Drop the leading timestamp.
            if text.startswith("##[group]"):
                nested = LogGroup(text[len("##[group]") :].strip() or None)
                pos = parse(lines, pos + 1, nested)
                into.append_child(nested)
            elif text.startswith("##[endgroup]"):
                return pos + 1
            else:
                into.append_line(text)
                pos += 1
        return pos

    root = LogGroup()
    parse(log.splitlines(), 0, root)
    return root
75+
76+
77+
# https://docs.github.com/en/rest/actions/workflow-jobs?apiVersion=2022-11-28#download-job-logs-for-a-workflow-run
def get_workflow_job_logs(repo: str, job_id: int) -> str:
    """Download the plain-text log of a single workflow job."""
    url = f"{BASE}/repos/{repo}/actions/jobs/{job_id}/logs"
    return requests.get(url, headers=HEADERS).text
83+
84+
85+
# https://docs.github.com/en/rest/actions/workflow-jobs?apiVersion=2022-11-28#list-jobs-for-a-workflow-run
def get_workflow_jobs(repo: str, run_id: int):
    """Return the latest-attempt jobs of one workflow run.

    Fetches a single page of up to 100 jobs; pagination is deliberately
    unsupported, so a run with more jobs fails loudly instead of silently
    returning a truncated list.
    """
    response = requests.get(
        f"{BASE}/repos/{repo}/actions/runs/{run_id}/jobs",
        headers=HEADERS,
        params={"per_page": 100, "filter": "latest"},
    )
    # A bare `assert` is stripped under `python -O`; raise explicitly so the
    # no-pagination assumption can never be silently violated.
    if "link" in response.headers:
        raise NotImplementedError(
            f"Run {run_id} has more than one page of jobs; pagination is not supported."
        )
    return response.json()["jobs"]
94+
95+
96+
# https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28#list-workflow-runs-for-a-repository
def get_workflow_runs(
    repo: str,
    event: str | None = None,
    status: str | None = None,
    pagination: int = 100,
):
    """Yield workflow runs of *repo*, page by page.

    Args:
        repo: "owner/name" repository slug.
        event: Optional triggering-event filter (e.g. "pull_request").
        status: Optional status/conclusion filter (e.g. "failure").
        pagination: Page size (the API allows at most 100).
    """
    total_count = math.inf  # 2025-10: Max we can get is 1000 entries anyway.
    fetched = 0
    page = 1
    while fetched < total_count:
        response = requests.get(
            f"{BASE}/repos/{repo}/actions/runs",
            headers=HEADERS,
            params={
                **({"event": event} if event else {}),
                **({"status": status} if status else {}),
                "per_page": pagination,
                "page": page,
            },
        )
        data = response.json()
        total_count = data["total_count"]
        runs = data["workflow_runs"]
        # The API stops serving results after ~1000 entries even when
        # total_count is larger; bail out on an empty page instead of
        # spinning forever.  (The previous `i // pagination + 1` page
        # computation could also re-request the same page after a short
        # page, and never advanced at all on an empty one.)
        if not runs:
            return
        fetched += len(runs)
        page += 1
        yield from runs
121+
122+
123+
# We now want to clone the original repo, add the other repo as a remote.
# Then fetch the commit, and find the merge base.

# 1. git clone https://github.com/llvm/llvm-project
# 2. git rev-parse --verify HEAD
# 3. git remote add XXX/llvm-project https://github.com/XXX/llvm-project.git
# 4. git fetch XXX/llvm-project YYY
# 5. git merge-base HEAD YYY
# 6. git diff MERGE_BASE..YYY


def get_bases_and_diffs(
    repo: str, head_repos_and_shas: list[tuple[str, str]]
) -> list[tuple[str, str]]:
    """For each (fork, sha) pair, return its merge base with *repo* and the diff.

    Clones *repo* once into a temporary directory, then for every
    ``(head_repo, head_sha)`` pair: adds the fork as a remote (if not added
    already), fetches the commit, and records ``(merge_base_sha, diff_text)``
    where the diff spans ``merge_base..head_sha``.  Pairs whose commit or
    fork has disappeared are skipped silently, so the result may be shorter
    than the input.
    """
    bases_and_diffs = []

    with tempfile.TemporaryDirectory() as directory:
        subprocess.run(
            ["git", "clone", f"https://github.com/{repo}.git", directory],
            check=True,
        )
        # Tip of the default branch at clone time; every merge base below is
        # computed against this commit.
        head = subprocess.check_output(
            ["git", "rev-parse", "--verify", "HEAD"],
            cwd=directory,
            text=True,
        ).strip()

        for head_repo, head_sha in head_repos_and_shas:
            # Remote names double as the fork's "owner/name" slug, so the
            # first column of `git remote -v` tells us which forks were
            # already added on an earlier iteration.
            remotes = [
                line.split()[0]
                for line in subprocess.check_output(
                    ["git", "remote", "-v"],
                    cwd=directory,
                    text=True,
                )
                .strip()
                .splitlines()
            ]
            if head_repo not in remotes:
                subprocess.run(
                    [
                        "git",
                        "remote",
                        "add",
                        head_repo,
                        f"https://github.com/{head_repo}.git",
                    ],
                    cwd=directory,
                    check=True,
                )

            # No check=True here: a failed fetch is an expected outcome that
            # we handle by skipping the pair rather than aborting the batch.
            fetch_process = subprocess.run(
                ["git", "fetch", head_repo, head_sha], cwd=directory
            )
            if fetch_process.returncode != 0:
                # Commit or repository deleted.
                continue

            merge_base = subprocess.check_output(
                ["git", "merge-base", head, head_sha],
                cwd=directory,
                text=True,
            ).strip()

            diff = subprocess.check_output(
                ["git", "diff", f"{merge_base}..{head_sha}"],
                cwd=directory,
                text=True,
            )

            bases_and_diffs.append((merge_base, diff))
    return bases_and_diffs
195+
196+
197+
# Workflow runs can be "failed" with no "failed" jobs, e.g. when workflow file is invalid.
# Jobs can be "failed" with no "failed" steps, e.g. when runner crashes.

# Crawl recent failed pre-merge CI runs of llvm/llvm-project and collect the
# (fork, commit) pairs whose Linux build job shows a C++ compile failure,
# then resolve each to a (merge base, diff) pair and dump them to disk.
entries = []

for i, run in enumerate(
    get_workflow_runs("llvm/llvm-project", event="pull_request", status="failure")
):
    # Only the pre-merge testing workflow is of interest.
    if run["path"] != ".github/workflows/premerge.yaml":
        continue
    jobs = get_workflow_jobs("llvm/llvm-project", run["id"])
    for job in jobs:
        if job["name"] != "Build and Test Linux":
            continue
        if job["conclusion"] != "failure":
            continue

        logs = get_workflow_job_logs("llvm/llvm-project", job["id"])

        # Drill into the nested log groups: "CMake" then its "ninja" child;
        # runs without those groups never reached the build step.
        log = group_job_log(logs).get_child_group("CMake")
        if not log:
            continue
        log = log.get_child_group("ninja")
        if not log:
            continue

        log = str(log)
        # Find a line with "] Building CXX" followed by a line with "FAILED: ".
        lines = log.splitlines()
        for j in range(len(lines) - 1):
            if "] Building CXX" in lines[j] and "FAILED: " in lines[j + 1]:
                print("Run:", i)
                print(f"Repository: {run['repository']['full_name']}")
                print(f"Head repository: {run['head_repository']['full_name']}")
                print("Commit:", run["head_sha"])
                print("URL:", job["html_url"])
                print()

                entries.append(
                    (
                        run["head_repository"]["full_name"],
                        run["head_sha"],
                    )
                )
                # One compile failure is enough to record this run.
                break

data = get_bases_and_diffs("llvm/llvm-project", entries)
print("Retrieved", len(entries), "failed CI commits...")
print("Retrieved", len(data), "actual commits...")
with open("commits.json", "w") as f:
    json.dump(data, f)

0 commit comments

Comments
 (0)