|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import argparse |
| 4 | +import csv |
| 5 | +import logging |
| 6 | +import os |
| 7 | +import re |
| 8 | +import sys |
| 9 | +import urllib.request |
| 10 | +from datetime import datetime |
| 11 | + |
| 12 | +from extract_archived_include_analysis import extract_include_analysis |
| 13 | +from include_analysis import IncludeAnalysisOutput, ParseError, parse_raw_include_analysis_output |
| 14 | +from suggest_include_changes import filter_filenames |
| 15 | + |
# Base URL of the storage bucket hosting Chromium's include analysis artifacts.
CHROMIUM_INCLUDE_ANALYSIS_BASE_URL = "https://commondatastorage.googleapis.com/chromium-browser-clang"
# Extracts the href target from each anchor tag in the archive index HTML.
HREF_REGEX = re.compile(r"<a href=\"(.*?)\">", re.DOTALL)
# Captures the "YYYY-MM-DD_HHMMSS" timestamp embedded in an archived analysis filename.
FILENAME_DATE_REGEX = re.compile(r"chrome_includes_(\d+-\d+-\d+_\d+)")
| 19 | + |
| 20 | + |
class IncludeAnalysisOutputWithUrl(IncludeAnalysisOutput):
    # The archive URL the analysis was fetched from; set by
    # get_archived_include_analysis() since it is not part of the parsed JSON.
    url: str
| 24 | + |
def extract_include_analysis_list() -> list[str]:
    """Fetch the archive index page and return absolute URLs of all archived analyses.

    Returns:
        A list of absolute analysis URLs in index order. (The diff logic in
        this file treats index + 1 as "immediately previous", implying the
        index lists newest entries first.)
    """
    # Use a context manager so the HTTP response is always closed,
    # instead of leaking the connection until GC.
    with urllib.request.urlopen(
        f"{CHROMIUM_INCLUDE_ANALYSIS_BASE_URL}/chrome_includes-index.html"
    ) as response:
        archive_html = response.read().decode("utf8")

    return [
        f"{CHROMIUM_INCLUDE_ANALYSIS_BASE_URL}/{relative_url}"
        for relative_url in HREF_REGEX.findall(archive_html)
    ]
| 37 | + |
| 38 | + |
def get_archived_include_analysis(analysis_url: str) -> IncludeAnalysisOutputWithUrl:
    """Download an archived include analysis page and parse it.

    Args:
        analysis_url: Absolute URL of an archived include analysis page.

    Returns:
        The parsed analysis output, augmented with the source URL under "url".

    Raises:
        RuntimeError: If the analysis JSON could not be extracted from the page.
    """
    # Close the HTTP response deterministically instead of relying on GC.
    with urllib.request.urlopen(analysis_url) as response:
        include_analysis_contents = response.read().decode("utf8")

    include_analysis_json = extract_include_analysis(include_analysis_contents)

    if not include_analysis_json:
        raise RuntimeError(f"Could not extract include analysis from {analysis_url}")

    # The URL is not included in the JSON, so we add it here so it can be in the output
    include_analysis = parse_raw_include_analysis_output(include_analysis_json)
    include_analysis["url"] = analysis_url

    return include_analysis
| 53 | + |
| 54 | + |
def parse_include_analysis_date(analysis_date: str) -> datetime:
    """Parse the analysis output's embedded date string into a datetime.

    The analysis output uses an ISO-like format with a trailing " UTC"
    suffix that ``datetime.fromisoformat`` cannot handle, so strip it
    first. Note the returned datetime is naive (no tzinfo attached).
    """
    return datetime.fromisoformat(analysis_date.removesuffix(" UTC"))
| 60 | + |
| 61 | + |
def include_analysis_diff(
    include_analysis: IncludeAnalysisOutput,
    min_edge_size: int,
    increase_percentage_threshold: int,
    increase_from_zero_threshold: int,
):
    """Yield rows describing significant size increases versus archived analyses.

    Compares ``include_analysis`` against up to three archived baselines (the
    immediately previous run, one at least 7 days old, and one at least 30 days
    old) and yields one tuple per flagged file or include edge:
    ``(previous_url, previous_revision, previous_date, filename, header, difference)``
    where ``header`` is ``""`` when the file itself (not an edge) was flagged.

    Args:
        include_analysis: The current parsed include analysis output.
        min_edge_size: Minimum current size in bytes before an increase to an
            existing file/edge is considered (callers convert MB to bytes).
        increase_percentage_threshold: Percentage (0-100) increase that flags
            an existing file/edge.
        increase_from_zero_threshold: Absolute byte increase that flags a
            file/edge that previously did not exist (or had size zero).

    Raises:
        RuntimeError: If the current analysis cannot be located in the archive
            index, or an archived filename's date cannot be parsed.
    """
    analysis_date = parse_include_analysis_date(include_analysis["date"])

    # Edges already reported against one baseline are suppressed for the
    # remaining (older) baselines so each increase is only reported once.
    flagged_edges = set()

    analysis_list = extract_include_analysis_list()
    analysis_filename_prefix = f"{CHROMIUM_INCLUDE_ANALYSIS_BASE_URL}/chrome_includes_{analysis_date.year}-{analysis_date.month:02d}-{analysis_date.day:02d}"

    # Find index of the provided analysis in case it is not the most recent
    analysis_idx = -1

    # Unfortunately the embedded date is not the same as the filename date,
    # they appear to differ by some amount of seconds, but the filename
    # always has the later timestamp, and the analysis runs are several
    # hours apart, so only check the prefix for the correct hour and the
    # next one as well to account for rollover into the next hour
    for idx, url in enumerate(analysis_list):
        if url.startswith(f"{analysis_filename_prefix}_{analysis_date.hour:02d}") or url.startswith(
            f"{analysis_filename_prefix}_{(analysis_date.hour + 1):02d}"
        ):
            analysis_idx = idx
            break

    if analysis_idx == -1:
        raise RuntimeError("Could not find the analysis in the archive list")

    # Gather previous analyses to compare to if they exist:
    # * Immediately previous analysis
    # * At least one week previous
    # * At least 30 days previous
    # Keyed by the baseline's age in whole days relative to analysis_date.
    previous_analyses = {}

    # First get the immediately previous analysis
    # NOTE(review): assumes the archive index is ordered newest-first and that
    # an entry exists after analysis_idx — an IndexError is possible if the
    # provided analysis is the oldest archived one. Confirm with the index.
    immediately_previous_analysis = get_archived_include_analysis(analysis_list[analysis_idx + 1])
    previous_analysis_date = parse_include_analysis_date(immediately_previous_analysis["date"])
    delta = analysis_date - previous_analysis_date
    previous_analyses[delta.days] = immediately_previous_analysis

    # Look for previous week and previous 30 days
    for min_days_delta in (7, 30):
        for previous_analysis_url in analysis_list:
            match = FILENAME_DATE_REGEX.search(previous_analysis_url)
            if match is None:
                raise RuntimeError(f"Could not parse date from URL: {previous_analysis_url}")

            # Determine the analysis date from the filename
            previous_analysis_date = datetime.strptime(match.group(1).strip(), "%Y-%m-%d_%H%M%S")
            delta = analysis_date - previous_analysis_date

            # Take the first (i.e. newest) analysis old enough to qualify,
            # then stop scanning for this min_days_delta.
            if delta.days >= min_days_delta:
                # This has already been covered, e.g, previous analysis was already that many days ago
                if delta.days in previous_analyses:
                    break

                previous_analyses[delta.days] = get_archived_include_analysis(previous_analysis_url)
                break

    # Filter out anything that isn't direct Chromium code
    filenames = filter_filenames(
        include_analysis["files"],
        filter_generated_files=True,
        filter_mojom_headers=True,
        filter_third_party=True,
    )

    for previous_analysis in previous_analyses.values():
        for filename in filenames:
            try:
                previous_size = previous_analysis["asizes"][filename]
            except KeyError:
                # New file
                previous_size = 0

            current_size = include_analysis["asizes"][filename]
            difference = current_size - previous_size
            flag_node = False

            # Flag the file itself, not just an edge, if it has a significant increase
            if previous_size == 0:
                flag_node = difference >= increase_from_zero_threshold
            elif current_size > min_edge_size:
                increase_percentage = difference / float(previous_size)
                flag_node = increase_percentage >= increase_percentage_threshold / 100.0

            if flag_node:
                # Empty header column marks a file-level (node) increase.
                yield (
                    previous_analysis["url"],
                    previous_analysis["revision"],
                    previous_analysis["date"],
                    filename,
                    "",
                    str(difference),
                )

            for header in include_analysis["esizes"][filename]:
                # Only consider the most recent increase if it was flagged
                if (filename, header) in flagged_edges:
                    continue

                try:
                    previous_size = previous_analysis["esizes"][filename][header]
                except KeyError:
                    # New edge
                    previous_size = 0

                current_size = include_analysis["esizes"][filename][header]

                # To cut down on noise, skip edges which are too small to care about
                if current_size < min_edge_size:
                    continue

                difference = current_size - previous_size

                # A lot of edges are zero so a percentage increase isn't applicable,
                # and instead we use an absolute increase in size - otherwise percentage
                if previous_size == 0:
                    flag_edge = difference >= increase_from_zero_threshold
                else:
                    increase_percentage = difference / float(previous_size)
                    flag_edge = increase_percentage >= increase_percentage_threshold / 100.0

                if flag_edge:
                    flagged_edges.add((filename, header))
                    yield (
                        previous_analysis["url"],
                        previous_analysis["revision"],
                        previous_analysis["date"],
                        filename,
                        header,
                        str(difference),
                    )
| 198 | + |
| 199 | + |
def main():
    """Parse CLI arguments, load the include analysis, and write diff rows as CSV to stdout.

    Returns:
        0 on success, 2 if the include analysis output could not be parsed.
    """
    parser = argparse.ArgumentParser(
        description="Analyze differences between an include analysis output and previous ones"
    )
    parser.add_argument(
        "include_analysis_output",
        type=argparse.FileType("r"),
        nargs="?",
        help="The include analysis output to use.",
    )
    parser.add_argument(
        "--min-edge-size",
        type=int,
        help="Minimum edge size in MB before flagging any increase.",
        default=75,
    )
    parser.add_argument(
        "--increase-percentage-threshold",
        type=int,
        help="Increase percentage threshold before flagging increase. 0-100.",
        default=50,
    )
    parser.add_argument(
        "--increase-from-zero-threshold",
        type=int,
        help="Increase in MB threshold before flagging an increase from a previously zero-sized edge.",
        default=75,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--quiet", action="store_true", default=False, help="Only log warnings and errors.")
    group.add_argument("--verbose", action="store_true", default=False, help="Enable verbose logging.")
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.DEBUG if args.verbose else logging.WARNING if args.quiet else logging.INFO,
    )

    # If the user specified an include analysis output file, use that instead of fetching it
    if args.include_analysis_output:
        raw_include_analysis = args.include_analysis_output.read()
    else:
        # Context manager ensures the HTTP response is closed after reading.
        with urllib.request.urlopen(
            "https://commondatastorage.googleapis.com/chromium-browser-clang/include-analysis.js"
        ) as include_analysis_response:
            raw_include_analysis = include_analysis_response.read().decode("utf8")

    try:
        include_analysis = parse_raw_include_analysis_output(raw_include_analysis)
    except ParseError as e:
        # Error text goes to stderr so it is never mixed into the CSV on stdout.
        message = str(e)
        print("error: Could not parse include analysis output file", file=sys.stderr)
        if message:
            print(message, file=sys.stderr)
        return 2

    csv_writer = csv.writer(sys.stdout)

    try:
        # Size thresholds are given in MB on the command line; the diff works in bytes.
        for row in include_analysis_diff(
            include_analysis,
            args.min_edge_size * 1024 * 1024,
            args.increase_percentage_threshold,
            args.increase_from_zero_threshold * 1024 * 1024,
        ):
            csv_writer.writerow(row)

        sys.stdout.flush()
    except BrokenPipeError:
        # Downstream consumer (e.g. `head`) closed the pipe: point stdout at
        # devnull so interpreter shutdown doesn't raise again, then exit nonzero.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)

    return 0
| 274 | + |
| 275 | + |
if __name__ == "__main__":
    try:
        status = main()
    except KeyboardInterrupt:
        pass  # Exit quietly on Ctrl-C without a traceback
    else:
        sys.exit(status)
0 commit comments