Skip to content
This repository was archived by the owner on Jul 16, 2025. It is now read-only.

Commit 5acf9fb

Browse files
Create plugin to compress ATS reports (#154)
Recently we've noticed that JSON reports with labels - the one format accepted by ATS - are simply too big. It is likely that we are experiencing Out Of Memory errors when trying to process them. To put it in perspective, below is a comparison between reports with no labels and with labels for the worker repo full test suite. ``` 918951 bytes (897Kb) May 15 19:45 coverage.xml # this is the regular coverage with no labels 26607438 bytes (25Mb) May 15 19:48 label.coverage.json # this is all labels, stock ``` To avoid processing issues, save bandwidth and all that, we will be compressing these reports. The compression idea is simple: we build a mapping label --> int that will substitute the label itself in the report (usually large-ish strings) for a number in the "contexts" of each file. We then include the reverse mapping in the report (int --> label) so that we can easily parse the compressed report when processing it. This is done via the new `compress_pycoverage_contexts` plugin that can be used when uploading with the CLI. Notice that the process is transparent for the user. The results are promising. Compressing the worker repo (25Mb) yields a new report with 801Kb. About the new requirements. These libs allow streaming large files (smart_open) and stream-parsing the JSON (ijson). This means we don't ever load the full big report in memory when compressing it, avoiding the Out Of Memory error in the plugin as well.
1 parent b5a84df commit 5acf9fb

File tree

8 files changed

+477
-3
lines changed

8 files changed

+477
-3
lines changed

.github/workflows/push_flow.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ jobs:
3939
- name: Install dependencies
4040
run: |
4141
python -m pip install --upgrade pip
42-
python setup.py develop
4342
pip install -r requirements.txt
43+
python setup.py develop
4444
- name: Create commit in codecov
4545
run: |
4646
codecovcli create-commit -t ${{ secrets.CODECOV_TOKEN }} --git-service github

codecov_cli/plugins/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import click
66

7+
from codecov_cli.plugins.compress_pycoverage_contexts import CompressPycoverageContexts
78
from codecov_cli.plugins.gcov import GcovPlugin
89
from codecov_cli.plugins.pycoverage import Pycoverage
910
from codecov_cli.plugins.xcode import XcodePlugin
@@ -54,6 +55,9 @@ def _get_plugin(cli_config, plugin_name):
5455
return Pycoverage(config)
5556
if plugin_name == "xcode":
5657
return XcodePlugin()
58+
if plugin_name == "compress-pycoverage":
59+
config = cli_config.get("plugins", {}).get("compress-pycoverage", {})
60+
return CompressPycoverageContexts(config)
5761
if cli_config and plugin_name in cli_config.get("plugins", {}):
5862
return _load_plugin_from_yaml(cli_config["plugins"][plugin_name])
5963
click.secho(f"Unable to find plugin {plugin_name}", fg="magenta", err=True)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
import json
2+
import logging
3+
import pathlib
4+
from decimal import Decimal
5+
from typing import Any, List
6+
7+
import ijson
8+
from smart_open import open
9+
10+
from codecov_cli.plugins.types import PreparationPluginReturn
11+
12+
logger = logging.getLogger("codecovcli")
13+
14+
15+
class Encoder(json.JSONEncoder):
    """JSON encoder that serializes ``Decimal`` values (as produced by ijson) as strings.

    Any other type that ``json`` cannot handle natively is delegated to the
    base class, which raises ``TypeError`` as usual.
    """

    def default(self, o: Any) -> Any:
        # Decimal is not JSON-serializable by default; emit its string form.
        return str(o) if isinstance(o, Decimal) else super().default(o)
20+
21+
22+
class CompressPycoverageContextsConfig(dict):
    """Typed, defaulted accessors over the plugin's configuration mapping."""

    @property
    def file_to_compress(self) -> pathlib.Path:
        """
        The report file to compress.
        file_to_compress: Union[str, pathlib.Path] [default coverage.json]
        """
        raw_path = self.get("file_to_compress", "coverage.json")
        return pathlib.Path(raw_path)

    @property
    def delete_uncompressed(self) -> bool:
        """
        Flag indicating to delete the original file after compressing.
        Recommended to avoid uploading the uncompressed file.
        delete_uncompressed: bool [default True]
        """
        if "delete_uncompressed" in self:
            return self["delete_uncompressed"]
        return True
39+
40+
41+
class CompressPycoverageContexts(object):
    """
    Preparation plugin that compresses pycoverage JSON reports containing
    contexts (labels).

    Reports with contexts repeat every label string on each covered line,
    which makes them very large. Compression substitutes each label with a
    small integer index and appends the inverted mapping (int -> label) to the
    report as "labels_table" so it can be decoded during processing. The input
    is stream-parsed (ijson) and the output stream-written, so the full report
    is never loaded into memory.
    """

    def __init__(self, config: dict = None) -> None:
        if config is None:
            config = {}
        self.config = CompressPycoverageContextsConfig(config)
        self.file_to_compress = self.config.file_to_compress
        # Strip only a *trailing* ".json" suffix; str.replace(".json", "")
        # would also mangle a ".json" occurring elsewhere in the path.
        base = str(self.file_to_compress)
        if base.endswith(".json"):
            base = base[: -len(".json")]
        self.file_to_write = pathlib.Path(base + ".codecov.json")

    def run_preparation(self, collector) -> "PreparationPluginReturn":
        """
        Compress ``self.file_to_compress`` into ``self.file_to_write``.

        ``collector`` is part of the plugin interface and is not used here.
        Returns a failure result (without raising) if the input is missing or
        not a regular file; deletes the uncompressed original afterwards when
        ``delete_uncompressed`` is configured (the default).
        """
        if not self.file_to_compress.exists():
            logger.warning(
                f"File to compress {self.file_to_compress} not found. Aborting"
            )
            return PreparationPluginReturn(
                success=False,
                messages=[f"File to compress {self.file_to_compress} not found."],
            )
        if not self.file_to_compress.is_file():
            logger.warning(
                f"File to compress {self.file_to_compress} is not a file. Aborting"
            )
            return PreparationPluginReturn(
                success=False,
                messages=[f"File to compress {self.file_to_compress} is not a file."],
            )
        # smart_open's open() streams large files. Context managers guarantee
        # both streams are closed even if compression fails midway.
        with open(self.file_to_compress, "rb") as fd_in:
            with open(self.file_to_write, "w") as fd_out:
                fd_out.write("{")
                self._copy_meta(fd_in, fd_out)
                files_in_report = ijson.kvitems(fd_in, "files")
                self._compress_files(files_in_report, fd_out)
                fd_out.write("}")
        logger.info(f"Compressed report written to {self.file_to_write}")
        # Delete the original file if configured, to avoid uploading it.
        if self.config.delete_uncompressed:
            logger.info(f"Deleting file {self.file_to_compress}")
            self.file_to_compress.unlink()
        return PreparationPluginReturn(success=True, messages=[])

    def _compress_files(self, files_in_report, fd_out) -> None:
        """
        Compress the 'files' entry in the coverage data.

        A labels table [str -> int] maps each label to an index; the index then
        substitutes the label itself in the per-line "contexts". Separators are
        written *before* each item (instead of trailing commas plus seek-back),
        so the output stays valid JSON even when a file has no contexts or the
        report contains no files at all.
        """
        labels_table = {}
        next_index = 0
        fd_out.write('"files":{')
        file_separator = ""
        for file_name, file_coverage_details in files_in_report:
            fd_out.write(file_separator)
            file_separator = ","
            self._copy_file_details(file_name, file_coverage_details, fd_out)
            fd_out.write('"contexts":{')
            line_separator = ""
            for line_number, labels in file_coverage_details["contexts"].items():
                fd_out.write(line_separator)
                line_separator = ","
                compressed_labels = []
                for label in labels:
                    stripped_label = label.split("|")[0]  # removes '|run' from label
                    if stripped_label not in labels_table:
                        labels_table[stripped_label] = next_index
                        next_index += 1
                    compressed_labels.append(labels_table[stripped_label])
                fd_out.write(
                    f"{json.dumps(line_number)}:{json.dumps(compressed_labels)}"
                )
            # One curly brace closes 'contexts', one closes the file entry.
            fd_out.write("}}")
        fd_out.write("},")
        # Save the inverted index (int -> label) in the report so processing
        # can translate the indexes back into labels.
        inverted_table = {value: key for key, value in labels_table.items()}
        fd_out.write(f'"labels_table": {json.dumps(inverted_table)}')

    def _copy_file_details(self, file_name, file_details, fd_out) -> None:
        """
        Copy the per-file entries that are not compressed (everything except
        'contexts'). json.dumps is used so names and values containing quotes
        or backslashes are properly escaped; default=str keeps the Decimal
        numbers produced by ijson serializable (as strings, matching the
        module's Encoder behavior).
        """
        fd_out.write(f"{json.dumps(file_name)}:{{")
        fd_out.write(
            f'"executed_lines": {json.dumps(file_details["executed_lines"], default=str)},'
        )
        fd_out.write(f'"summary": {json.dumps(file_details["summary"], default=str)},')
        fd_out.write(
            f'"missing_lines": {json.dumps(file_details["missing_lines"], default=str)},'
        )
        fd_out.write(
            f'"excluded_lines": {json.dumps(file_details["excluded_lines"], default=str)},'
        )

    def _copy_meta(self, fd_in, fd_out) -> None:
        """
        Copy every top-level key except 'files' verbatim into the output, then
        rewind fd_in so the 'files' section can be stream-parsed from the start.
        """
        for key, value in ijson.kvitems(fd_in, ""):
            if key == "files":
                continue
            fd_out.write(f"{json.dumps(key)}: {json.dumps(value, default=str)},")
        fd_in.seek(0)

requirements.in

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ pytest-asyncio
66
pyyaml
77
responses
88
httpx
9-
tree_sitter
9+
tree_sitter
10+
ijson
11+
smart-open

requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ idna==3.3
3030
# anyio
3131
# requests
3232
# rfc3986
33+
ijson==3.2.0.post0
34+
# via -r requirements.in
3335
iniconfig==1.1.1
3436
# via pytest
3537
packaging==21.3
@@ -60,6 +62,8 @@ responses==0.21.0
6062
# via -r requirements.in
6163
rfc3986[idna2008]==1.5.0
6264
# via httpx
65+
smart-open==6.3.0
66+
# via -r requirements.in
6367
sniffio==1.3.0
6468
# via
6569
# anyio

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
long_description_content_type='text/markdown',
1818
author="Codecov",
1919
author_email="[email protected]",
20-
install_requires=["click", "requests", "PyYAML", "tree_sitter", "httpx"],
20+
install_requires=["click", "requests", "PyYAML", "tree_sitter", "httpx", "pytest", "pytest-cov", "ijson", "smart-open"],
2121
entry_points={
2222
"console_scripts": [
2323
"codecovcli = codecov_cli.main:run",

0 commit comments

Comments
 (0)