github
diff --git a/misc/scripts/library-coverage/frameworks.py b/misc/scripts/library-coverage/frameworks.py
Lines changed: 72 additions & 0 deletions
diff --git a/misc/scripts/library-coverage/generate-report.py b/misc/scripts/library-coverage/generate-report.py
Lines changed: 31 additions & 93 deletions
@@ -0,0 +1,72 @@
+import csv
+import sys
+import packages
+
+
+class Framework:
+    """
+    Frameworks are the aggregation units in the RST and timeseries reports. These are read from the frameworks.csv file.
+    """
+
+    def __init__(self, name, url, package_pattern):
+        self.name = name
+        self.url = url
+        self.package_pattern = package_pattern
+
+
+class FrameworkCollection:
+    """
+    A (sorted) list of frameworks.
+    """
+
+    def __init__(self, path):
+        self.frameworks: list[Framework] = []
+        self.package_patterns = set()
+
+        with open(path) as csvfile:
+            reader = csv.reader(csvfile)
+            next(reader)
+            for row in reader:
+                # row: Hibernate,https://hibernate.org/,org.hibernate
+                self.__add(Framework(row[0], row[1], row[2]))
+        self.__sort()
+
+    def __add(self, framework: Framework):
+        if framework.package_pattern not in self.package_patterns:
+            self.package_patterns.add(framework.package_pattern)
+            self.frameworks.append(framework)
+        else:
+            print("Package pattern already exists: " +
+                  framework.package_pattern, file=sys.stderr)
+
+    def __sort(self):
+        self.frameworks.sort(key=lambda f: f.name)
+
+    def get(self, framework_name):
+        for framework in self.frameworks:
+            if framework.name == framework_name:
+                return framework
+        return None
+
+    def get_patterns(self):
+        return self.package_patterns
+
+    def get_frameworks(self):
+        return self.frameworks
+
+    def __package_match(self, package: packages.Package, pattern):
+        return (pattern.endswith("*") and package.name.startswith(pattern[:-1])) or (not pattern.endswith("*") and pattern == package.name)
+
+    def get_package_filter(self, framework: Framework):
+        """
+        Returns a lambda filter that holds for packages that match the current framework.
+
+        The pattern is either a full name, such as "org.hibernate", or a prefix, such as "java.*".
+        Package patterns might overlap; in the case of 'org.apache.commons.io' and 'org.apache.*', the statistics for
+        the latter will not include the statistics for the former.
+        """
+        return lambda p: \
+            self.__package_match(p, framework.package_pattern) and \
+            all(
+                len(framework.package_pattern) >= len(pattern) or
+                not self.__package_match(p, pattern) for pattern in self.get_patterns())
@@ -4,6 +4,8 @@
 import shutil
 import settings
 import utils
+import packages as pack
+import frameworks as fr
 
 """
 This script runs the CSV coverage report QL query, and transforms it to a more readable format.
@@ -28,14 +30,7 @@ def append_csv_dict_item(list, dictionary, key):
         list.append(None)
 
 
-def increment_dict_item(value, dictionary, key):
-    """Increments the value of the dictionary[key] by value."""
-    if key not in dictionary:
-        dictionary[key] = 0
-    dictionary[key] += int(value)
-
-
-def collect_package_stats(packages, cwes, filter):
+def collect_package_stats(packages: pack.PackageCollection, cwes, filter):
     """
     Collects coverage statistics for packages matching the given filter. `filter` is a `lambda` that for example (i) matches
     packages to frameworks, or (2) matches packages that were previously not processed.
@@ -48,20 +43,21 @@ def collect_package_stats(packages, cwes, filter):
     framework_cwes = {}
     processed_packages = set()
 
-    for package in packages:
+    for package in packages.get_packages():
+        package: pack.Package = package
         if filter(package):
             processed_packages.add(package)
-            sources += int(packages[package]["kind"].get("source:remote", 0))
-            steps += int(packages[package]["part"].get("summary", 0))
-            sinks += int(packages[package]["part"].get("sink", 0))
+            sources += package.get_kind_count("source:remote")
+            steps += package.get_part_count("summary")
+            sinks += package.get_part_count("sink")
 
             for cwe in cwes:
                 sink = "sink:" + cwes[cwe]["sink"]
-                if sink in packages[package]["kind"]:
+                count = package.get_kind_count(sink)
+                if count > 0:
                     if cwe not in framework_cwes:
                         framework_cwes[cwe] = 0
-                    framework_cwes[cwe] += int(
-                        packages[package]["kind"][sink])
+                    framework_cwes[cwe] += count
 
     return sources, steps, sinks, framework_cwes, processed_packages
 
@@ -137,37 +133,12 @@ def add_package_stats_to_row(row, sorted_cwes, collect):
     utils.run_codeql_query(config.ql_path, db, ql_output)
     shutil.rmtree(db)
 
-    packages = {}
-    parts = set()
-    kinds = set()
-
-    # Read the generated CSV file, and collect package statistics.
-    with open(ql_output) as csvfile:
-        reader = csv.reader(csvfile)
-        for row in reader:
-            # row: "android.util",1,"remote","source",16
-            package = row[0]
-            if package not in packages:
-                packages[package] = {
-                    "count": row[1],
-                    # part: "summary", "sink", or "source"
-                    "part": {},
-                    # kind: "source:remote", "sink:create-file", ...
-                    "kind": {}
-                }
-
-            part = row[3]
-            parts.add(part)
-            increment_dict_item(row[4], packages[package]["part"], part)
-
-            kind = part + ":" + row[2]
-            kinds.add(kind)
-            increment_dict_item(row[4], packages[package]["kind"], kind)
+    packages = pack.PackageCollection(ql_output)
 
     os.remove(ql_output)
 
-    parts = sorted(parts)
-    kinds = sorted(kinds)
+    parts = packages.get_parts()
+    kinds = packages.get_kinds()
 
     # Write the denormalized package statistics to a CSV file.
     with open(output_csv.format(language=lang), 'w', newline='') as csvfile:
@@ -179,44 +150,21 @@ def add_package_stats_to_row(row, sorted_cwes, collect):
 
         csvwriter.writerow(headers)
 
-        for package in sorted(packages):
-            row = [package]
+        for package in packages.get_packages():
+            package: pack.Package = package
+            row = [package.name]
             for part in parts:
-                append_csv_dict_item(row, packages[package]["part"], part)
+                append_csv_number(row, package.get_part_count(part))
             for kind in kinds:
-                append_csv_dict_item(row, packages[package]["kind"], kind)
+                append_csv_number(row, package.get_kind_count(kind))
             csvwriter.writerow(row)
 
     # Read the additional framework data, such as URL, friendly name
-    frameworks = {}
-
-    with open(input_framework_csv.format(language=lang)) as csvfile:
-        reader = csv.reader(csvfile)
-        next(reader)
-        for row in reader:
-            # row: Hibernate,https://hibernate.org/,org.hibernate
-            framwork = row[0]
-            if framwork not in frameworks:
-                frameworks[framwork] = {
-                    "package": row[2],
-                    "url": row[1]
-                }
+    frameworks = fr.FrameworkCollection(
+        input_framework_csv.format(language=lang))
 
     # Read the additional CWE data
-    cwes = {}
-
-    with open(input_cwe_sink_csv.format(language=lang)) as csvfile:
-        reader = csv.reader(csvfile)
-        next(reader)
-        for row in reader:
-            # row: CWE-89,sql,SQL injection
-            cwe = row[0]
-            if cwe not in cwes:
-                cwes[cwe] = {
-                    "sink": row[1],
-                    "label": row[2]
-                }
-
+    cwes = utils.read_cwes(input_cwe_sink_csv.format(language=lang))
     sorted_cwes = sorted(cwes)
 
     with open(output_rst.format(language=lang), 'w', newline='') as rst_file:
@@ -246,34 +194,24 @@ def add_package_stats_to_row(row, sorted_cwes, collect):
 
         processed_packages = set()
 
-        all_package_patterns = set(
-            (frameworks[fr]["package"] for fr in frameworks))
-
         # Write a row for each framework.
-        for framework in sorted(frameworks):
+        for framework in frameworks.get_frameworks():
+            framework: fr.Framework = framework
             row = []
 
             # Add the framework name to the row
-            if not frameworks[framework]["url"]:
-                row.append(row_prefix + framework)
+            if not framework.url:
+                row.append(row_prefix + framework.name)
             else:
                 row.append(
-                    row_prefix + "`" + framework + " <" + frameworks[framework]["url"] + ">`_")
+                    row_prefix + "`" + framework.name + " <" + framework.url + ">`_")
 
             # Add the package name to the row
-            row.append("``" + frameworks[framework]["package"] + "``")
-
-            current_package_pattern = frameworks[framework]["package"]
+            row.append("``" + framework.package_pattern + "``")
 
             # Collect statistics on the current framework
-            # current_package_pattern is either full name, such as "org.hibernate", or a prefix, such as "java.*"
-            # Package patterns might overlap, in case of 'org.apache.commons.io' and 'org.apache.*', the statistics for
-            # the latter will not include the statistics for the former.
-            def package_match(package_name, pattern): return (pattern.endswith(
-                "*") and package_name.startswith(pattern[:-1])) or (not pattern.endswith("*") and pattern == package_name)
-
             def collect_framework(): return collect_package_stats(
-                packages, cwes, lambda p: package_match(p, current_package_pattern) and all(len(current_package_pattern) >= len(pattern) or not package_match(p, pattern) for pattern in all_package_patterns))
+                packages, cwes, frameworks.get_package_filter(framework))
 
             row, f_processed_packages = add_package_stats_to_row(
                 row, sorted_cwes, collect_framework)
@@ -290,8 +228,8 @@ def collect_others(): return collect_package_stats(
         row, other_packages = add_package_stats_to_row(
             row, sorted_cwes, collect_others)
 
-        row[1] = ", ".join("``{0}``".format(p)
-                           for p in sorted(other_packages))
+        row[1] = ", ".join("``{0}``".format(p.name)
+                           for p in sorted(other_packages, key=lambda x: x.name))
 
         csvwriter.writerow(row)