Skip to content

Commit 76a25d9

Browse files
committed
feat: include_analysis_diff.py script
1 parent 8d5f453 commit 76a25d9

File tree

3 files changed

+296
-4
lines changed

3 files changed

+296
-4
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ Scripts to help guide cleanup of #include lines in a codebase, using `clangd`
66

77
* `apply_include_changes.py` - Apply include changes to files in the source
88
tree
9+
* `extract_archived_include_analysis.py` - Extract archived include analysis JSON
910
* `filter_include_changes.py` - Filter include changes output
11+
* `include_analysis_diff.py` - Analyze differences between an include analysis
12+
output and previous ones
1013
* `list_includers.py` - List includers of a file
1114
* `list_transitive_includes.py` - List transitive (and direct) includes of a file
1215
* `post_process_compilation_db.py` - Post-process the clang compilation

extract_archived_include_analysis.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,15 @@
1010
DATA_REGEX = re.compile(r".*<script>\n?(data = .*?)<\/script>", re.DOTALL)


def extract_include_analysis(contents: str) -> str:
    """Pull the embedded ``data = ...`` payload out of an archived analysis page.

    Args:
        contents: Full HTML text of an archived include analysis page.

    Returns:
        The stripped ``data = ...`` script body, or "" when no match is found.
    """
    # DOTALL + a greedy leading .* means this behaves like an anchored search
    # across the whole document, capturing the script body lazily.
    found = DATA_REGEX.match(contents)
    return found.group(1).strip() if found else ""
20+
21+
1322
def main():
1423
parser = argparse.ArgumentParser(description="Extract archived include analysis JSON")
1524
parser.add_argument("include_analysis_url", help="The include analysis output URL to extract.")
@@ -21,11 +30,11 @@ def main():
2130

2231
contents = urllib.request.urlopen(args.include_analysis_url).read()
2332

24-
data_match = DATA_REGEX.match(contents.decode("utf-8"))
25-
2633
try:
27-
if data_match:
28-
print(data_match.group(1).strip())
34+
include_analysis = extract_include_analysis(contents.decode("utf-8"))
35+
36+
if include_analysis:
37+
print(include_analysis)
2938

3039
sys.stdout.flush()
3140
except BrokenPipeError:

include_analysis_diff.py

Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import csv
5+
import logging
6+
import os
7+
import re
8+
import sys
9+
import urllib.request
10+
from datetime import datetime
11+
12+
from extract_archived_include_analysis import extract_include_analysis
13+
from include_analysis import IncludeAnalysisOutput, ParseError, parse_raw_include_analysis_output
14+
from suggest_include_changes import filter_filenames
15+
16+
# Base URL of the bucket holding the Chromium include analysis archive.
CHROMIUM_INCLUDE_ANALYSIS_BASE_URL = "https://commondatastorage.googleapis.com/chromium-browser-clang"
# Captures the href target of each anchor tag in the archive index page.
HREF_REGEX = re.compile(r"<a href=\"(.*?)\">", re.DOTALL)
# Captures the timestamp portion of an archived analysis filename,
# e.g. "2024-01-02_030405" from "chrome_includes_2024-01-02_030405..."
# (parsed later with strptime format "%Y-%m-%d_%H%M%S").
FILENAME_DATE_REGEX = re.compile(r"chrome_includes_(\d+-\d+-\d+_\d+)")
19+
20+
21+
class IncludeAnalysisOutputWithUrl(IncludeAnalysisOutput):
    """IncludeAnalysisOutput plus the archive URL it was fetched from.

    NOTE(review): values of this type are assigned via ["url"] subscript in
    this file, so the base presumably behaves like a TypedDict — confirm.
    """

    # Archive URL the analysis was downloaded from; not part of the raw JSON.
    url: str
23+
24+
25+
def extract_include_analysis_list() -> list[str]:
    """Fetch the archive index page and return absolute URLs of all archived
    include analysis outputs.

    Callers in this file treat index position N+1 as "immediately previous",
    i.e. they assume the index page lists analyses newest-first — the ordering
    itself comes from the server.

    Returns:
        Absolute URLs, one per <a href="..."> link in the index page.
    """
    include_analysis_archive_response = urllib.request.urlopen(
        f"{CHROMIUM_INCLUDE_ANALYSIS_BASE_URL}/chrome_includes-index.html"
    )
    archive_html = include_analysis_archive_response.read().decode("utf8")

    # List comprehension over the compiled pattern's findall is the idiomatic
    # form of the original list(map(lambda ...)) construction.
    return [
        f"{CHROMIUM_INCLUDE_ANALYSIS_BASE_URL}/{relative_url}"
        for relative_url in HREF_REGEX.findall(archive_html)
    ]
37+
38+
39+
def get_archived_include_analysis(analysis_url: str) -> IncludeAnalysisOutputWithUrl:
    """Download an archived include analysis page and parse its embedded JSON.

    Args:
        analysis_url: Absolute URL of the archived analysis page.

    Returns:
        The parsed analysis output, with the source URL stored under "url".

    Raises:
        RuntimeError: If no analysis payload can be extracted from the page.
    """
    page_contents = urllib.request.urlopen(analysis_url).read().decode("utf8")

    raw_json = extract_include_analysis(page_contents)
    if not raw_json:
        raise RuntimeError(f"Could not extract include analysis from {analysis_url}")

    # The URL is not included in the JSON, so we add it here so it can be in the output
    parsed = parse_raw_include_analysis_output(raw_json)
    parsed["url"] = analysis_url

    return parsed
53+
54+
55+
def parse_include_analysis_date(analysis_date: str) -> datetime:
    """Parse the embedded ``date`` field of an include analysis output.

    The field may carry a trailing " UTC" suffix that fromisoformat cannot
    parse, so it is stripped first (str.removesuffix is a no-op when the
    suffix is absent, replacing the manual endswith/slice pair).

    Args:
        analysis_date: e.g. "2024-01-02 03:04:05 UTC" or an ISO 8601 string.

    Returns:
        A naive datetime — note it is NOT tz-aware even though the source
        timestamp is UTC; all datetimes in this script are naive, so the
        subtractions elsewhere remain consistent.
    """
    return datetime.fromisoformat(analysis_date.removesuffix(" UTC"))
60+
61+
62+
def include_analysis_diff(
    include_analysis: IncludeAnalysisOutput,
    min_edge_size: int,
    increase_percentage_threshold: int,
    increase_from_zero_threshold: int,
):
    """Compare an include analysis against archived previous ones and yield flagged growth.

    Compares the given analysis against the immediately previous archived run,
    plus (when present) runs at least 7 and at least 30 days older, flagging
    files and include edges whose size grew significantly.

    Args:
        include_analysis: The parsed "current" include analysis output.
        min_edge_size: Minimum current size (bytes) before an edge — or a
            file's percentage-based growth — is considered at all.
        increase_percentage_threshold: Growth percentage (0-100) over the
            previous size that triggers flagging.
        increase_from_zero_threshold: Absolute growth (bytes) that triggers
            flagging when the previous size was zero (new file or edge).

    Yields:
        Tuples of (previous analysis URL, previous revision, previous date,
        filename, header or "" for a file-level row, size difference as str).

    Raises:
        RuntimeError: If the given analysis can't be located in the archive
            index, or an archive URL's date can't be parsed.
    """
    analysis_date = parse_include_analysis_date(include_analysis["date"])

    # Edges already flagged against a more recent baseline; suppressed for
    # older baselines so each edge is reported at most once.
    flagged_edges: set[tuple[str, str]] = set()

    analysis_list = extract_include_analysis_list()
    analysis_filename_prefix = f"{CHROMIUM_INCLUDE_ANALYSIS_BASE_URL}/chrome_includes_{analysis_date.year}-{analysis_date.month:02d}-{analysis_date.day:02d}"

    # Find index of the provided analysis in case it is not the most recent
    analysis_idx = -1

    # Unfortunately the embedded date is not the same as the filename date,
    # they appear to differ by some amount of seconds, but the filename
    # always has the later timestamp, and the analysis runs are several
    # hours apart, so only check the prefix for the correct hour and the
    # next one as well to account for rollover into the next hour
    for idx, url in enumerate(analysis_list):
        if url.startswith(f"{analysis_filename_prefix}_{analysis_date.hour:02d}") or url.startswith(
            f"{analysis_filename_prefix}_{(analysis_date.hour + 1):02d}"
        ):
            analysis_idx = idx
            break

    if analysis_idx == -1:
        raise RuntimeError("Could not find the analysis in the archive list")

    # Gather previous analyses to compare to if they exist:
    # * Immediately previous analysis
    # * At least one week previous
    # * At least 30 days previous
    # Keyed by how many days older the baseline is than the current analysis.
    previous_analyses: dict[int, IncludeAnalysisOutputWithUrl] = {}

    # First get the immediately previous analysis
    # NOTE(review): assumes the archive list is ordered newest-first and that
    # an older entry exists — raises IndexError if the provided analysis is
    # the oldest archived one. Confirm against the index page ordering.
    immediately_previous_analysis = get_archived_include_analysis(analysis_list[analysis_idx + 1])
    previous_analysis_date = parse_include_analysis_date(immediately_previous_analysis["date"])
    delta = analysis_date - previous_analysis_date
    previous_analyses[delta.days] = immediately_previous_analysis

    # Look for previous week and previous 30 days
    # (takes the first URL old enough — again relies on newest-first order)
    for min_days_delta in (7, 30):
        for previous_analysis_url in analysis_list:
            match = FILENAME_DATE_REGEX.search(previous_analysis_url)
            if match is None:
                raise RuntimeError(f"Could not parse date from URL: {previous_analysis_url}")

            # Determine the analysis date from the filename
            previous_analysis_date = datetime.strptime(match.group(1).strip(), "%Y-%m-%d_%H%M%S")
            delta = analysis_date - previous_analysis_date

            if delta.days >= min_days_delta:
                # This has already been covered, e.g. previous analysis was already that many days ago
                if delta.days in previous_analyses:
                    break

                previous_analyses[delta.days] = get_archived_include_analysis(previous_analysis_url)
                break

    # Filter out anything that isn't direct Chromium code
    filenames = filter_filenames(
        include_analysis["files"],
        filter_generated_files=True,
        filter_mojom_headers=True,
        filter_third_party=True,
    )

    for previous_analysis in previous_analyses.values():
        for filename in filenames:
            try:
                # "asizes" holds per-file (accumulated) sizes
                previous_size = previous_analysis["asizes"][filename]
            except KeyError:
                # New file
                previous_size = 0

            current_size = include_analysis["asizes"][filename]
            difference = current_size - previous_size
            flag_node = False

            # Flag the file itself, not just an edge, if it has a significant increase
            if previous_size == 0:
                flag_node = difference >= increase_from_zero_threshold
            elif current_size > min_edge_size:
                increase_percentage = difference / float(previous_size)
                flag_node = increase_percentage >= increase_percentage_threshold / 100.0

            if flag_node:
                # Empty header field marks a file-level (node) row
                yield (
                    previous_analysis["url"],
                    previous_analysis["revision"],
                    previous_analysis["date"],
                    filename,
                    "",
                    str(difference),
                )

            # "esizes" holds per-(file, included header) edge sizes
            for header in include_analysis["esizes"][filename]:
                # Only consider the most recent increase if it was flagged
                if (filename, header) in flagged_edges:
                    continue

                try:
                    previous_size = previous_analysis["esizes"][filename][header]
                except KeyError:
                    # New edge
                    previous_size = 0

                current_size = include_analysis["esizes"][filename][header]

                # To cut down on noise, skip edges which are too small to care about
                if current_size < min_edge_size:
                    continue

                difference = current_size - previous_size

                # A lot of edges are zero so a percentage increase isn't applicable,
                # and instead we use an absolute increase in size - otherwise percentage
                if previous_size == 0:
                    flag_edge = difference >= increase_from_zero_threshold
                else:
                    increase_percentage = difference / float(previous_size)
                    flag_edge = increase_percentage >= increase_percentage_threshold / 100.0

                if flag_edge:
                    flagged_edges.add((filename, header))
                    yield (
                        previous_analysis["url"],
                        previous_analysis["revision"],
                        previous_analysis["date"],
                        filename,
                        header,
                        str(difference),
                    )
198+
199+
200+
def main():
    """Entry point: parse arguments, load the current analysis, and write
    flagged size increases as CSV rows to stdout.

    Returns:
        0 on success, 2 when the analysis output cannot be parsed.
    """
    parser = argparse.ArgumentParser(
        description="Analyze differences between an include analysis output and previous ones"
    )
    parser.add_argument(
        "include_analysis_output",
        type=argparse.FileType("r"),
        nargs="?",
        help="The include analysis output to use.",
    )
    parser.add_argument(
        "--min-edge-size",
        type=int,
        help="Minimum edge size in MB before flagging any increase.",
        default=75,
    )
    parser.add_argument(
        "--increase-percentage-threshold",
        type=int,
        help="Increase percentage threshold before flagging increase. 0-100.",
        default=50,
    )
    parser.add_argument(
        "--increase-from-zero-threshold",
        type=int,
        help="Increase in MB threshold before flagging an increase from a previously zero-sized edge.",
        default=75,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--quiet", action="store_true", default=False, help="Only log warnings and errors.")
    group.add_argument("--verbose", action="store_true", default=False, help="Enable verbose logging.")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.WARNING if args.quiet else logging.INFO
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=log_level)

    # If the user specified an include analysis output file, use that instead of fetching it
    if args.include_analysis_output is not None:
        raw_output = args.include_analysis_output.read()
    else:
        live_response = urllib.request.urlopen(
            "https://commondatastorage.googleapis.com/chromium-browser-clang/include-analysis.js"
        )
        raw_output = live_response.read().decode("utf8")

    try:
        include_analysis = parse_raw_include_analysis_output(raw_output)
    except ParseError as err:
        print("error: Could not parse include analysis output file")
        details = str(err)
        if details:
            print(details)
        return 2

    # Size arguments are given in MB; the diff works in bytes.
    megabyte = 1024 * 1024
    writer = csv.writer(sys.stdout)

    try:
        for row in include_analysis_diff(
            include_analysis,
            args.min_edge_size * megabyte,
            args.increase_percentage_threshold,
            args.increase_from_zero_threshold * megabyte,
        ):
            writer.writerow(row)

        sys.stdout.flush()
    except BrokenPipeError:
        # Downstream consumer (e.g. `head`) closed the pipe; exit quietly.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)

    return 0
274+
275+
276+
# Script entry point: exit with main()'s return code, and swallow Ctrl-C
# so an interrupted run exits without a traceback.
if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        pass  # Don't show the user anything

0 commit comments

Comments
 (0)