Skip to content

Commit 27de922

Browse files
committed
feat(find_edges_to_cut): script to help find interesting edges to cut
1 parent 491ac3f commit 27de922

File tree

1 file changed

+283
-0
lines changed

1 file changed

+283
-0
lines changed

find_edges_to_cut.py

Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import csv
5+
import logging
6+
import os
7+
import sys
8+
from typing import List, Optional, Set, Tuple
9+
10+
import networkx as nx
11+
12+
from cut_header import compute_added_sizes
13+
from include_analysis import IncludeAnalysisOutput, ParseError, load_include_analysis
14+
from utils import create_graph_from_include_analysis
15+
16+
17+
def create_include_graph(
    include_analysis: IncludeAnalysisOutput,
    skips: Optional[Set[Tuple[str, str]]],
) -> nx.DiGraph:
    """Build the include graph, with any skipped edges removed.

    Args:
        include_analysis: Parsed include analysis output.
        skips: Optional set of (includer, included) filename pairs whose
            edges should be removed from the graph before analysis. A
            pair that names an unknown file or a nonexistent edge is
            logged and ignored.

    Returns:
        The include graph as a directed graph whose nodes are indices
        into include_analysis["files"].
    """
    DG: nx.DiGraph = create_graph_from_include_analysis(include_analysis)
    files = include_analysis["files"]

    if skips:
        # Build the filename -> index mapping once, instead of doing an
        # O(len(files)) list scan (membership test plus .index()) for
        # every skip pair.
        file_idx_lookup = {filename: idx for idx, filename in enumerate(files)}

        for includer, included in skips:
            if includer in file_idx_lookup and included in file_idx_lookup:
                includer_idx = file_idx_lookup[includer]
                included_idx = file_idx_lookup[included]

                if DG.has_edge(includer_idx, included_idx):
                    DG.remove_edge(includer_idx, included_idx)
                else:
                    logging.warning(f"Skip edge {includer} -> {included} not found in include graph")
            else:
                logging.warning(f"Skip edge {includer} -> {included} not found in include analysis")

    return DG
38+
39+
40+
def find_entry_points(
    include_analysis: IncludeAnalysisOutput,
    DG: nx.DiGraph,
    subset: Set[str],
) -> List[str]:
    """Find entry points into a subset of nodes.

    An entry point is a node in the subset that is still reachable from
    at least one root after all edges *between* nodes in the subset have
    been removed. This identifies which subset nodes are the first to
    appear on paths from roots into the subset cluster.

    Args:
        include_analysis: Parsed include analysis output.
        DG: The include graph (nodes are indices into the files list).
        subset: Filenames forming the subset of interest; names not
            present in the include analysis are silently ignored.

    Returns:
        The entry-point filenames, sorted.
    """
    files = include_analysis["files"]
    file_idx_lookup = {filename: idx for idx, filename in enumerate(files)}
    subset_indices = {file_idx_lookup[f] for f in subset if f in file_idx_lookup}

    # Work on a copy so the caller's graph is left untouched.
    DG2 = DG.copy()

    # Remove all edges between nodes in the subset. Walking only the
    # out-edges of each subset node covers every intra-subset edge exactly
    # once; also walking in-edges would record each edge a second time.
    edges_to_remove = [
        (u, v) for u in subset_indices for _, v in DG2.out_edges(u) if v in subset_indices
    ]
    DG2.remove_edges_from(edges_to_remove)

    # For each root, find which subset nodes are still reachable.
    entry_points = set()

    root_indices = {
        file_idx_lookup[root] for root in include_analysis["roots"] if root in file_idx_lookup
    }

    for root_idx in root_indices:
        # nx.descendants returns every node reachable from root_idx
        # (excluding root_idx itself).
        reachable = nx.descendants(DG2, root_idx)
        entry_points.update(reachable & subset_indices)

    return sorted(files[idx] for idx in entry_points)
84+
85+
86+
def find_top_edges(
    include_analysis: IncludeAnalysisOutput,
    subset: Set[str],
    top_n: int = 10,
    ignores: Optional[Set[Tuple[str, str]]] = None,
) -> List[Tuple[str, str, float]]:
    """Find the top N edges between nodes inside the subset, ranked by prevalence.

    Args:
        include_analysis: Parsed include analysis output.
        subset: Filenames forming the subset; only edges whose includer
            and included file are both in the subset are considered.
        top_n: Maximum number of edges to return.
        ignores: Optional (includer, included) pairs to exclude.

    Returns a list of (includer, included, prevalence) tuples sorted by
    prevalence descending.
    """
    root_count = len(include_analysis["roots"])
    edges = []

    for included in subset:
        for includer in include_analysis["included_by"].get(included, []):
            if includer not in subset:
                continue
            if ignores and (includer, included) in ignores:
                continue
            # Prevalence is the percentage of roots that (transitively)
            # include the includer.
            prevalence = (100.0 * include_analysis["prevalence"][includer]) / root_count
            edges.append((includer, included, prevalence))

    # Sort by prevalence descending, breaking ties by filename so the
    # output is deterministic regardless of set iteration order, then
    # take the top N.
    edges.sort(key=lambda x: (-x[2], x[0], x[1]))
    return edges[:top_n]
112+
113+
114+
def find_top_edges_by_dominators(
    include_analysis: IncludeAnalysisOutput,
    subset: Set[str],
    dominators: dict,
    top_n: int = 10,
    ignores: Optional[Set[Tuple[str, str]]] = None,
) -> List[Tuple[str, str, float, int]]:
    """Find the top N edges between nodes inside the subset, ranked by dominator count.

    Args:
        include_analysis: Parsed include analysis output.
        subset: Filenames forming the subset; only edges whose includer
            and included file are both in the subset are considered.
        dominators: Mapping of (includer, included) -> dominator count;
            edges missing from the mapping count as 0.
        top_n: Maximum number of edges to return.
        ignores: Optional (includer, included) pairs to exclude.

    Returns a list of (includer, included, prevalence, dominator_count) tuples
    sorted by dominator count descending.
    """
    root_count = len(include_analysis["roots"])
    edges = []

    for included in subset:
        for includer in include_analysis["included_by"].get(included, []):
            if includer not in subset:
                continue
            if ignores and (includer, included) in ignores:
                continue
            prevalence = (100.0 * include_analysis["prevalence"][includer]) / root_count
            dom_count = dominators.get((includer, included), 0)
            edges.append((includer, included, prevalence, dom_count))

    # Sort by dominator count descending, breaking ties by filename so
    # the output is deterministic regardless of set iteration order,
    # then take the top N.
    edges.sort(key=lambda x: (-x[3], x[0], x[1]))
    return edges[:top_n]
142+
143+
144+
# Adapted from analyze_includes.py in Chromium
def compute_doms(DG: nx.DiGraph, roots):
    """Compute how much each include-graph edge adds, via dominance analysis.

    Args:
        DG: Include graph whose nodes carry a "filename" attribute
            (nodes without one are skipped as sources).
        roots: The root filenames to run the dominance analysis from.

    Returns:
        Whatever compute_added_sizes returns for the augmented graph
        (per-edge added sizes, keyed by the (src, dst) pseudo-nodes).
    """
    # Give each node a size of 1 to represent one file
    sizes = {data["filename"]: 1 for _, data in DG.nodes(data=True) if "filename" in data}

    # Hoist the node -> attribute-dict mapping out of the loops instead of
    # rebuilding a NodeDataView with DG.nodes(data=True) per successor.
    node_attrs = DG.nodes

    # Split each src -> dst edge in includes into src -> (src,dst) -> dst, so that
    # we can compute how much each include graph edge adds to the size by doing
    # dominance analysis on the (src,dst) nodes.
    augmented_includes = {}
    for src_node_id, src_data in DG.nodes(data=True):
        if "filename" not in src_data:
            continue

        src = src_data["filename"]
        augmented_includes.setdefault(src, set())

        for dst_node_id in DG.successors(src_node_id):
            # NOTE(review): assumes every successor carries a "filename"
            # attribute (a KeyError otherwise, same as the original) —
            # confirm against create_graph_from_include_analysis.
            dst = node_attrs[dst_node_id]["filename"]
            augmented_includes[src].add((src, dst))
            augmented_includes[(src, dst)] = {dst}

    return compute_added_sizes((roots, augmented_includes, sizes))
167+
168+
169+
def _load_edge_csvs(paths: List[str]) -> Set[Tuple[str, str]]:
    """Load (includer, included) edge pairs from CSV files into one set.

    Blank rows and comment rows (first field starting with "#") are
    skipped. Only the first two fields of a row are used, so trailing
    annotation columns cannot break tuple matching; rows with fewer than
    two fields are skipped with a warning.
    """
    edges: Set[Tuple[str, str]] = set()

    for path in paths:
        with open(path, "r", newline="") as f:
            for row in csv.reader(f):
                if not row or not row[0].strip() or row[0].startswith("#"):
                    continue
                if len(row) < 2:
                    logging.warning(f"Ignoring malformed edge row in {path}: {row}")
                    continue
                edges.add((row[0], row[1]))

    return edges


def main():
    """Entry point: report entry points and high-value edges to cut.

    Returns a process exit code: 0 on success (including an empty
    subset), 2 if the include analysis output could not be parsed.
    """
    parser = argparse.ArgumentParser(
        description="Find entry points and top edges for high-prevalence headers in the include graph."
    )
    parser.add_argument(
        "include_analysis_output",
        type=str,
        nargs="?",
        help="The include analysis output to use (can be a file path or URL). If not specified, pulls the latest.",
    )
    parser.add_argument("--skips", action="append", default=[], help="CSV files of edges to skip (remove from graph).")
    parser.add_argument("--ignores", action="append", default=[], help="CSV files of edges to ignore.")
    parser.add_argument(
        "--min-prevalence",
        type=float,
        required=True,
        help="Minimum prevalence percentage for a node to be in the subset.",
    )
    parser.add_argument(
        "--top", type=int, default=10, help="Number of top edges to output by prevalence (default: 10)."
    )
    parser.add_argument("--verbose", action="store_true", default=False, help="Enable verbose logging.")
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.DEBUG if args.verbose else logging.WARNING,
    )

    try:
        include_analysis = load_include_analysis(args.include_analysis_output)
    except ParseError as e:
        message = str(e)
        print("error: Could not parse include analysis output file")
        if message:
            print(message)
        return 2

    root_count = len(include_analysis["roots"])

    # Load skips and ignores from CSV files. Both go through the same
    # loader so comment/blank rows are allowed in either kind of file
    # (previously only skips files filtered comments).
    skips = _load_edge_csvs(args.skips)
    ignores = _load_edge_csvs(args.ignores)

    # Build the subset: filter out generated/system/third-party headers and keep those meeting minimum prevalence
    EXCLUDED_PREFIXES = ("out/", "buildtools/", "build/", "third_party/", "v8/")
    EXCLUDED_EXCEPTIONS = ("third_party/blink/",)
    subset: Set[str] = set()

    for filename in include_analysis["files"]:
        if filename.startswith(EXCLUDED_PREFIXES) and not filename.startswith(EXCLUDED_EXCEPTIONS):
            continue

        prevalence = (100.0 * include_analysis["prevalence"].get(filename, 0)) / root_count

        if prevalence >= args.min_prevalence:
            subset.add(filename)

    logging.info(f"Subset size: {len(subset)} nodes with >= {args.min_prevalence:.2f}% prevalence")

    if not subset:
        print(f"No nodes meet the minimum prevalence of {args.min_prevalence:.2f}%", file=sys.stderr)
        return 0

    DG: nx.DiGraph = create_include_graph(include_analysis, skips)

    entry_points = find_entry_points(include_analysis, DG, subset)

    # Map filenames to node indices once, rather than paying an
    # O(len(files)) list.index() scan per subset member.
    file_idx_lookup = {filename: idx for idx, filename in enumerate(include_analysis["files"])}
    dominators = compute_doms(
        DG.subgraph([file_idx_lookup[node] for node in subset]).copy(), entry_points
    )

    # Find and output top N edges by prevalence
    top_edges = find_top_edges(include_analysis, subset, top_n=args.top, ignores=ignores)

    # Find top N edges by dominator count
    top_edges_by_doms = find_top_edges_by_dominators(
        include_analysis, subset, dominators, top_n=args.top, ignores=ignores
    )

    # Headers go to stderr so stdout stays machine-readable CSV.
    print(f"Top {args.top} edges by prevalence:", file=sys.stderr)

    try:
        csv_writer = csv.writer(sys.stdout)
        for includer, included, prevalence in top_edges:
            csv_writer.writerow([includer, included, f"{prevalence:.2f}", dominators.get((includer, included), 0)])

        print(f"\nTop {args.top} edges by dominator count:", file=sys.stderr)

        for includer, included, prevalence, dom_count in top_edges_by_doms:
            csv_writer.writerow([includer, included, f"{prevalence:.2f}", dom_count])

        sys.stdout.flush()
    except BrokenPipeError:
        # Point stdout at devnull so the interpreter's shutdown flush
        # doesn't raise a second BrokenPipeError after we exit.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)

    return 0
277+
278+
279+
if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        pass  # Don't show the user anything (exit quietly on Ctrl-C, no traceback)

0 commit comments

Comments
 (0)