Skip to content
This repository was archived by the owner on Jul 16, 2025. It is now read-only.

Commit 5acf9fb

Browse files
Create plugin to compress ATS reports (#154)
Recently we've noticed that JSON reports with labels - the one format accepted by ATS - are simply too big. It is likely that we are experiencing Out Of Memory errors when trying to process them. To put it in perspective, below is a comparison between reports with no labels and with labels for the worker repo full test suite. ``` 918951 bytes (897Kb) May 15 19:45 coverage.xml # this is the regular coverage with no labels 26607438 bytes (25Mb) May 15 19:48 label.coverage.json # this is all labels, stock ``` To avoid processing issues, save bandwidth and all that, we will be compressing these reports. The compression idea is simple: we build a mapping label --> int that will substitute the label itself in the report (usually large-ish strings) for a number in the "contexts" of each file. We then include the reverse mapping in the report (int --> label) so that we can easily parse the compressed report when processing it. This is done via the new `compress_pycoverage_contexts` plugin that can be used when uploading with the CLI. Notice that the process is transparent for the user. The results are promising. Compressing the worker repo (25Mb) yields a new report with 801Kb. About the new requirements. These libs allow streaming large files (smart_open) and stream-parsing the JSON (ijson). This means we don't ever load the full big report in memory when compressing it, avoiding the Out Of Memory error in the plugin as well.
1 parent b5a84df commit 5acf9fb

File tree

8 files changed

+477
-3
lines changed

8 files changed

+477
-3
lines changed

.github/workflows/push_flow.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ jobs:
3939
- name: Install dependencies
4040
run: |
4141
python -m pip install --upgrade pip
42-
python setup.py develop
4342
pip install -r requirements.txt
43+
python setup.py develop
4444
- name: Create commit in codecov
4545
run: |
4646
codecovcli create-commit -t ${{ secrets.CODECOV_TOKEN }} --git-service github

codecov_cli/plugins/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import click
66

7+
from codecov_cli.plugins.compress_pycoverage_contexts import CompressPycoverageContexts
78
from codecov_cli.plugins.gcov import GcovPlugin
89
from codecov_cli.plugins.pycoverage import Pycoverage
910
from codecov_cli.plugins.xcode import XcodePlugin
@@ -54,6 +55,9 @@ def _get_plugin(cli_config, plugin_name):
5455
return Pycoverage(config)
5556
if plugin_name == "xcode":
5657
return XcodePlugin()
58+
if plugin_name == "compress-pycoverage":
59+
config = cli_config.get("plugins", {}).get("compress-pycoverage", {})
60+
return CompressPycoverageContexts(config)
5761
if cli_config and plugin_name in cli_config.get("plugins", {}):
5862
return _load_plugin_from_yaml(cli_config["plugins"][plugin_name])
5963
click.secho(f"Unable to find plugin {plugin_name}", fg="magenta", err=True)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
import json
2+
import logging
3+
import pathlib
4+
from decimal import Decimal
5+
from typing import Any, List
6+
7+
import ijson
8+
from smart_open import open
9+
10+
from codecov_cli.plugins.types import PreparationPluginReturn
11+
12+
logger = logging.getLogger("codecovcli")
13+
14+
15+
class Encoder(json.JSONEncoder):
    """JSON encoder that serializes ``Decimal`` values (as produced by ijson) as strings.

    Any other type that ``json`` cannot handle natively is delegated to the
    base class, which raises ``TypeError`` as usual.
    """

    def default(self, o: Any) -> Any:
        # Decimal is not JSON-serializable by default; emit its string form.
        return str(o) if isinstance(o, Decimal) else super().default(o)
20+
21+
22+
class CompressPycoverageContextsConfig(dict):
    """Typed, defaulted accessors over the plugin's configuration mapping."""

    @property
    def file_to_compress(self) -> pathlib.Path:
        """
        The report file to compress.
        file_to_compress: Union[str, pathlib.Path] [default coverage.json]
        """
        raw_path = self.get("file_to_compress", "coverage.json")
        return pathlib.Path(raw_path)

    @property
    def delete_uncompressed(self) -> bool:
        """
        Flag indicating to delete the original file after compressing.
        Recommended to avoid uploading the uncompressed file.
        delete_uncompressed: bool [default True]
        """
        if "delete_uncompressed" in self:
            return self["delete_uncompressed"]
        return True
39+
40+
41+
class CompressPycoverageContexts(object):
    """
    Preparation plugin that compresses pycoverage JSON reports containing
    contexts (labels).

    Reports with contexts repeat every label string on each covered line,
    which makes them very large. Compression substitutes each label with a
    small integer index and appends the inverted mapping (int -> label) to the
    report as "labels_table" so it can be decoded during processing. The input
    is stream-parsed (ijson) and the output stream-written, so the full report
    is never loaded into memory.
    """

    def __init__(self, config: dict = None) -> None:
        if config is None:
            config = {}
        self.config = CompressPycoverageContextsConfig(config)
        self.file_to_compress = self.config.file_to_compress
        # Strip only a *trailing* ".json" suffix; str.replace(".json", "")
        # would also mangle a ".json" occurring elsewhere in the path.
        base = str(self.file_to_compress)
        if base.endswith(".json"):
            base = base[: -len(".json")]
        self.file_to_write = pathlib.Path(base + ".codecov.json")

    def run_preparation(self, collector) -> "PreparationPluginReturn":
        """
        Compress ``self.file_to_compress`` into ``self.file_to_write``.

        ``collector`` is part of the plugin interface and is not used here.
        Returns a failure result (without raising) if the input is missing or
        not a regular file; deletes the uncompressed original afterwards when
        ``delete_uncompressed`` is configured (the default).
        """
        if not self.file_to_compress.exists():
            logger.warning(
                f"File to compress {self.file_to_compress} not found. Aborting"
            )
            return PreparationPluginReturn(
                success=False,
                messages=[f"File to compress {self.file_to_compress} not found."],
            )
        if not self.file_to_compress.is_file():
            logger.warning(
                f"File to compress {self.file_to_compress} is not a file. Aborting"
            )
            return PreparationPluginReturn(
                success=False,
                messages=[f"File to compress {self.file_to_compress} is not a file."],
            )
        # smart_open's open() streams large files. Context managers guarantee
        # both streams are closed even if compression fails midway.
        with open(self.file_to_compress, "rb") as fd_in:
            with open(self.file_to_write, "w") as fd_out:
                fd_out.write("{")
                self._copy_meta(fd_in, fd_out)
                files_in_report = ijson.kvitems(fd_in, "files")
                self._compress_files(files_in_report, fd_out)
                fd_out.write("}")
        logger.info(f"Compressed report written to {self.file_to_write}")
        # Delete the original file if configured, to avoid uploading it.
        if self.config.delete_uncompressed:
            logger.info(f"Deleting file {self.file_to_compress}")
            self.file_to_compress.unlink()
        return PreparationPluginReturn(success=True, messages=[])

    def _compress_files(self, files_in_report, fd_out) -> None:
        """
        Compress the 'files' entry in the coverage data.

        A labels table [str -> int] maps each label to an index; the index then
        substitutes the label itself in the per-line "contexts". Separators are
        written *before* each item (instead of trailing commas plus seek-back),
        so the output stays valid JSON even when a file has no contexts or the
        report contains no files at all.
        """
        labels_table = {}
        next_index = 0
        fd_out.write('"files":{')
        file_separator = ""
        for file_name, file_coverage_details in files_in_report:
            fd_out.write(file_separator)
            file_separator = ","
            self._copy_file_details(file_name, file_coverage_details, fd_out)
            fd_out.write('"contexts":{')
            line_separator = ""
            for line_number, labels in file_coverage_details["contexts"].items():
                fd_out.write(line_separator)
                line_separator = ","
                compressed_labels = []
                for label in labels:
                    stripped_label = label.split("|")[0]  # removes '|run' from label
                    if stripped_label not in labels_table:
                        labels_table[stripped_label] = next_index
                        next_index += 1
                    compressed_labels.append(labels_table[stripped_label])
                fd_out.write(
                    f"{json.dumps(line_number)}:{json.dumps(compressed_labels)}"
                )
            # One curly brace closes 'contexts', one closes the file entry.
            fd_out.write("}}")
        fd_out.write("},")
        # Save the inverted index (int -> label) in the report so processing
        # can translate the indexes back into labels.
        inverted_table = {value: key for key, value in labels_table.items()}
        fd_out.write(f'"labels_table": {json.dumps(inverted_table)}')

    def _copy_file_details(self, file_name, file_details, fd_out) -> None:
        """
        Copy the per-file entries that are not compressed (everything except
        'contexts'). json.dumps is used so names and values containing quotes
        or backslashes are properly escaped; default=str keeps the Decimal
        numbers produced by ijson serializable (as strings, matching the
        module's Encoder behavior).
        """
        fd_out.write(f"{json.dumps(file_name)}:{{")
        fd_out.write(
            f'"executed_lines": {json.dumps(file_details["executed_lines"], default=str)},'
        )
        fd_out.write(f'"summary": {json.dumps(file_details["summary"], default=str)},')
        fd_out.write(
            f'"missing_lines": {json.dumps(file_details["missing_lines"], default=str)},'
        )
        fd_out.write(
            f'"excluded_lines": {json.dumps(file_details["excluded_lines"], default=str)},'
        )

    def _copy_meta(self, fd_in, fd_out) -> None:
        """
        Copy every top-level key except 'files' verbatim into the output, then
        rewind fd_in so the 'files' section can be stream-parsed from the start.
        """
        for key, value in ijson.kvitems(fd_in, ""):
            if key == "files":
                continue
            fd_out.write(f"{json.dumps(key)}: {json.dumps(value, default=str)},")
        fd_in.seek(0)

requirements.in

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ pytest-asyncio
66
pyyaml
77
responses
88
httpx
9-
tree_sitter
9+
tree_sitter
10+
ijson
11+
smart-open

requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ idna==3.3
3030
# anyio
3131
# requests
3232
# rfc3986
33+
ijson==3.2.0.post0
34+
# via -r requirements.in
3335
iniconfig==1.1.1
3436
# via pytest
3537
packaging==21.3
@@ -60,6 +62,8 @@ responses==0.21.0
6062
# via -r requirements.in
6163
rfc3986[idna2008]==1.5.0
6264
# via httpx
65+
smart-open==6.3.0
66+
# via -r requirements.in
6367
sniffio==1.3.0
6468
# via
6569
# anyio

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
long_description_content_type='text/markdown',
1818
author="Codecov",
1919
author_email="[email protected]",
20-
install_requires=["click", "requests", "PyYAML", "tree_sitter", "httpx"],
20+
install_requires=["click", "requests", "PyYAML", "tree_sitter", "httpx", "pytest", "pytest-cov", "ijson", "smart-open"],
2121
entry_points={
2222
"console_scripts": [
2323
"codecovcli = codecov_cli.main:run",

0 commit comments

Comments
 (0)