From fb866672321c979dddd568192cd0c971cf87b3e5 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Tue, 7 Oct 2025 19:08:41 +0530 Subject: [PATCH 1/5] Refine D2D pipeline for Scala and Kotlin Signed-off-by: Tushar Goel --- scanpipe/pipelines/deploy_to_develop.py | 52 +++++- scanpipe/pipes/d2d.py | 150 +++++++--------- scanpipe/pipes/d2d_config.py | 10 ++ scanpipe/pipes/jvm.py | 221 ++++++++++++++++-------- scanpipe/tests/pipes/test_d2d.py | 31 ++-- scanpipe/tests/pipes/test_jvm.py | 22 ++- 6 files changed, 304 insertions(+), 182 deletions(-) diff --git a/scanpipe/pipelines/deploy_to_develop.py b/scanpipe/pipelines/deploy_to_develop.py index d047f3de03..48a7bea3d4 100644 --- a/scanpipe/pipelines/deploy_to_develop.py +++ b/scanpipe/pipelines/deploy_to_develop.py @@ -27,6 +27,7 @@ from scanpipe.pipes import d2d_config from scanpipe.pipes import flag from scanpipe.pipes import input +from scanpipe.pipes import jvm from scanpipe.pipes import matchcode from scanpipe.pipes import purldb from scanpipe.pipes import scancode @@ -73,6 +74,10 @@ def steps(cls): cls.find_java_packages, cls.map_java_to_class, cls.map_jar_to_source, + cls.find_scala_packages, + cls.map_scala_to_class, + cls.find_kotlin_packages, + cls.map_kotlin_to_class, cls.map_javascript, cls.map_javascript_symbols, cls.map_javascript_strings, @@ -168,17 +173,58 @@ def match_archives_to_purldb(self): @optional_step("Java") def find_java_packages(self): """Find the java package of the .java source files.""" - d2d.find_java_packages(self.project, logger=self.log) + d2d.find_jvm_packages( + project=self.project, jvm_lang=jvm.JavaLanguage, logger=self.log + ) @optional_step("Java") def map_java_to_class(self): """Map a .class compiled file to its .java source.""" - d2d.map_java_to_class(project=self.project, logger=self.log) + d2d.map_jvm_to_class( + project=self.project, logger=self.log, jvm_lang=jvm.JavaLanguage + ) @optional_step("Java") def map_jar_to_source(self): """Map .jar files to their related source directory.""" - d2d.map_jar_to_source(project=self.project, logger=self.log) + d2d.map_jar_to_jvm_source( + project=self.project, logger=self.log, jvm_lang=jvm.JavaLanguage + ) + + @optional_step("Scala") + def find_scala_packages(self): + """Find the java package of the .scala source files.""" + d2d.find_jvm_packages( + project=self.project, jvm_lang=jvm.ScalaLanguage, logger=self.log + ) + + @optional_step("Scala") + def map_scala_to_class(self): + """Map a .class compiled file to its .java source.""" + d2d.map_jvm_to_class( + project=self.project, logger=self.log, jvm_lang=jvm.ScalaLanguage + ) + + @optional_step("Scala") + def map_jar_to_scala_source(self): + """Map .jar files to their related source directory.""" + d2d.map_jar_to_jvm_source( + project=self.project, logger=self.log, jvm_lang=jvm.ScalaLanguage + ) + + @optional_step("Kotlin") + def find_kotlin_packages(self): + """Find the java package of the .java source files.""" + d2d.find_jvm_packages( + project=self.project, jvm_lang=jvm.KotlinLanguage, logger=self.log + ) + + @optional_step("Kotlin") + def map_kotlin_to_class(self): + """Map a .class compiled file to its .java source.""" + d2d.map_jvm_to_class( + project=self.project, logger=self.log, jvm_lang=jvm.KotlinLanguage + ) @optional_step("JavaScript") def map_javascript(self): diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index 40a54fc693..1db9f81f5e 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -161,33 +161,38 @@ def map_checksum(project, checksum_field, logger=None): _map_checksum_resource(to_resource, from_resources, checksum_field) -def _map_java_to_class_resource(to_resource, from_resources, from_classes_index): +def _map_jvm_to_class_resource( + to_resource, from_resources, from_classes_index, jvm_lang: jvm.JvmLanguage +): """ Map the ``to_resource`` .class file Resource with a Resource in ``from_resources`` .java files, using the ``from_classes_index`` index of from/ fully qualified Java class names. """ - normalized_java_path = jvm.get_normalized_java_path(to_resource.path) - match = pathmap.find_paths(path=normalized_java_path, index=from_classes_index) - if not match: - return - - for resource_id in match.resource_ids: - from_resource = from_resources.get(id=resource_id) - # compute the root of the packages on the source side - from_source_root_parts = from_resource.path.strip("/").split("/") - from_source_root = "/".join( - from_source_root_parts[: -match.matched_path_length] - ) - pipes.make_relation( - from_resource=from_resource, - to_resource=to_resource, - map_type="java_to_class", - extra_data={"from_source_root": f"{from_source_root}/"}, + for extension in jvm_lang.source_extensions: + normalized_path = jvm_lang.get_normalized_path( + path=to_resource.path, extension=extension ) + match = pathmap.find_paths(path=normalized_path, index=from_classes_index) + if not match: + return + + for resource_id in match.resource_ids: + from_resource = from_resources.get(id=resource_id) + # compute the root of the packages on the source side + from_source_root_parts = from_resource.path.strip("/").split("/") + from_source_root = "/".join( + from_source_root_parts[: -match.matched_path_length] + ) + pipes.make_relation( + from_resource=from_resource, + to_resource=to_resource, + map_type=jvm_lang.binary_map_type, + extra_data={"from_source_root": f"{from_source_root}/"}, + ) -def map_java_to_class(project, logger=None): +def map_jvm_to_class(project, jvm_lang: jvm.JvmLanguage, logger=None): """ Map to/ compiled Java .class(es) to from/ .java source using Java fully qualified paths and indexing from/ .java files. @@ -196,112 +201,76 @@ def map_java_to_class(project, logger=None): from_resources = project_files.from_codebase() to_resources = project_files.to_codebase().has_no_relation() - to_resources_dot_class = to_resources.filter(extension=".class") - from_resources_dot_java = ( - from_resources.filter(extension=".java") + filter = {f"extra_data__{jvm_lang.source_package_attribute_name}__isnull": False} + + to_resources_binary_extension = to_resources.filter( + extension__in=jvm_lang.binary_extensions + ) + from_resources_source_extension = ( + from_resources.filter(extension__in=jvm_lang.source_extensions) # The "java_package" extra_data value is set during the `find_java_packages`, # it is required to build the index. - .filter(extra_data__java_package__isnull=False) + .filter(**filter) ) - to_resource_count = to_resources_dot_class.count() - from_resource_count = from_resources_dot_java.count() + to_resource_count = to_resources_binary_extension.count() + from_resource_count = from_resources_source_extension.count() if not from_resource_count: - logger("No .java resources to map.") + logger(f"No {jvm_lang.source_extensions} resources to map.") return if logger: logger( f"Mapping {to_resource_count:,d} .class resources to " - f"{from_resource_count:,d} .java" + f"{from_resource_count:,d} {jvm_lang.source_extensions}" ) # build an index using from-side Java fully qualified class file names # built from the "java_package" and file name - indexables = get_indexable_qualified_java_paths(from_resources_dot_java) + indexables = jvm_lang.get_indexable_qualified_paths(from_resources_source_extension) # we do not index subpath since we want to match only fully qualified names from_classes_index = pathmap.build_index(indexables, with_subpaths=False) - resource_iterator = to_resources_dot_class.iterator(chunk_size=2000) + resource_iterator = to_resources_binary_extension.iterator(chunk_size=2000) progress = LoopProgress(to_resource_count, logger) for to_resource in progress.iter(resource_iterator): - _map_java_to_class_resource(to_resource, from_resources, from_classes_index) - - -def get_indexable_qualified_java_paths_from_values(resource_values): - """ - Yield tuples of (resource id, fully-qualified Java path) for indexable - classes from a list of ``resource_data`` tuples of "from/" side of the - project codebase. - - These ``resource_data`` input tuples are in the form: - (resource.id, resource.name, resource.extra_data) - - And the output tuples look like this example:: - (123, "org/apache/commons/LoggerImpl.java") - """ - for resource_id, resource_name, resource_extra_data in resource_values: - fully_qualified = jvm.get_fully_qualified_java_path( - java_package=resource_extra_data.get("java_package"), - filename=resource_name, + _map_jvm_to_class_resource( + to_resource, from_resources, from_classes_index, jvm_lang ) - yield resource_id, fully_qualified - - -def get_indexable_qualified_java_paths(from_resources_dot_java): - """ - Yield tuples of (resource id, fully-qualified Java class name) for indexable - classes from the "from/" side of the project codebase using the - "java_package" Resource.extra_data. - """ - resource_values = from_resources_dot_java.values_list("id", "name", "extra_data") - return get_indexable_qualified_java_paths_from_values(resource_values) -def find_java_packages(project, logger=None): +def find_jvm_packages(project, jvm_lang: jvm.JvmLanguage, logger=None): """ - Collect the Java packages of Java source files for a ``project``. + Collect the JVM packages of Java source files for a ``project``. Multiprocessing is enabled by default on this pipe, the number of processes can be controlled through the SCANCODEIO_PROCESSES setting. Note: we use the same API as the ScanCode scans by design """ - from_java_resources = ( - project.codebaseresources.files() - .no_status() - .from_codebase() - .has_no_relation() - .filter(extension=".java") + resources = ( + project.codebaseresources.files().no_status().from_codebase().has_no_relation() ) + from_jvm_resources = resources.filter(extension__in=jvm_lang.source_extensions) + if logger: logger( - f"Finding Java package for {from_java_resources.count():,d} " - ".java resources." + f"Finding {jvm_lang.name} packages for {from_jvm_resources.count():,d} " + f"{jvm_lang.source_extensions} resources." ) scancode.scan_resources( - resource_qs=from_java_resources, - scan_func=scan_for_java_package, - save_func=save_java_package_scan_results, + resource_qs=from_jvm_resources, + scan_func=jvm_lang.scan_for_source_package, + save_func=save_jvm_package_scan_results, progress_logger=logger, ) -def scan_for_java_package(location, with_threading=True): - """ - Run a Java package scan on provided ``location``. - - Return a dict of scan ``results`` and a list of ``errors``. - """ - scanners = [scancode.Scanner("java_package", jvm.get_java_package)] - return scancode._scan_resource(location, scanners, with_threading=with_threading) - - -def save_java_package_scan_results(codebase_resource, scan_results, scan_errors): +def save_jvm_package_scan_results(codebase_resource, scan_results, scan_errors): """ Save the resource Java package scan results in the database as Resource.extra_data. Create project errors if any occurred during the scan. @@ -314,11 +283,14 @@ def save_java_package_scan_results(codebase_resource, scan_results, scan_errors) codebase_resource.update_extra_data(scan_results) -def _map_jar_to_source_resource(jar_resource, to_resources, from_resources): +def _map_jar_to_jvm_source_resource( + jar_resource, to_resources, from_resources, jvm_lang: jvm.JvmLanguage +): jar_extracted_path = get_extracted_path(jar_resource) jar_extracted_dot_class_files = list( to_resources.filter( - extension=".class", path__startswith=jar_extracted_path + extension__in=jvm_lang.binary_extensions, + path__startswith=jar_extracted_path, ).values("id", "status") ) @@ -338,7 +310,7 @@ def _map_jar_to_source_resource(jar_resource, to_resources, from_resources): dot_class_file.get("id") for dot_class_file in jar_extracted_dot_class_files ] java_to_class_extra_data_list = CodebaseRelation.objects.filter( - to_resource__in=dot_class_file_ids, map_type="java_to_class" + to_resource__in=dot_class_file_ids, map_type=jvm_lang.binary_map_type ).values_list("extra_data", flat=True) from_source_roots = [ @@ -358,7 +330,7 @@ def _map_jar_to_source_resource(jar_resource, to_resources, from_resources): ) -def map_jar_to_source(project, logger=None): +def map_jar_to_jvm_source(project, jvm_lang: jvm.JvmLanguage, logger=None): """Map .jar files to their related source directory.""" project_files = project.codebaseresources.files() # Include the directories to map on the common source @@ -377,7 +349,9 @@ def map_jar_to_source(project, logger=None): progress = LoopProgress(to_jars_count, logger) for jar_resource in progress.iter(resource_iterator): - _map_jar_to_source_resource(jar_resource, to_resources, from_resources) + _map_jar_to_jvm_source_resource( + jar_resource, to_resources, from_resources, jvm_lang=jvm_lang + ) def _map_path_resource( diff --git a/scanpipe/pipes/d2d_config.py b/scanpipe/pipes/d2d_config.py index f329dbe65b..5baebf61db 100644 --- a/scanpipe/pipes/d2d_config.py +++ b/scanpipe/pipes/d2d_config.py @@ -86,6 +86,16 @@ class EcosystemConfig: matchable_package_extensions=[".jar", ".war"], matchable_resource_extensions=[".class"], ), + "Scala": EcosystemConfig( + ecosystem_option="Scala", + matchable_package_extensions=[".jar", ".war"], + matchable_resource_extensions=[".class"], + ), + "Kotlin": EcosystemConfig( + ecosystem_option="Kotlin", + matchable_package_extensions=[".jar", ".war"], + matchable_resource_extensions=[".class"], + ), "JavaScript": EcosystemConfig( ecosystem_option="JavaScript", matchable_resource_extensions=[ diff --git a/scanpipe/pipes/jvm.py b/scanpipe/pipes/jvm.py index f6fb506eb8..ede6f3fb54 100644 --- a/scanpipe/pipes/jvm.py +++ b/scanpipe/pipes/jvm.py @@ -24,91 +24,168 @@ import re from pathlib import Path +from re import Pattern + +from scanpipe.pipes import scancode + + +class JvmLanguage: + # Name of the JVM language like java, kotlin or scala, just as an FYI + name: str = None + # Tuple of source file extensions + source_extensions: tuple = tuple() + # Tuple of binary file extensions + binary_extensions: tuple = (".class",) + # Like java_package, kotlin_package, scala_package, used as an attribute in resource + source_package_attribute_name: str = None + # A regex pattern to extract a package from a source file + package_regex: Pattern = None + # Type of relation for a binary file to its source file + binary_map_type: str = None + + @classmethod + def get_source_package(cls, location, **kwargs): + """ + Read the source file at ``location`` and return a source package for this + language as a mapping with a single key using the value of + ``source_package_attribute_name`` as a key name or None. + + Note: this is the same API as a ScanCode Toolkit API scanner function by + design. + """ + if not location: + return -java_package_re = re.compile(r"^\s*package\s+([\w\.]+)\s*;") - - -def get_java_package(location, java_extensions=(".java",), **kwargs): - """ - Return a Java package as a mapping with a single "java_package" key, or ``None`` - from the .java source code file at ``location``. - - Only look at files with an extension in the ``java_extensions`` tuple. - - Note: this is the same API as a ScanCode Toolkit API scanner function by - design. - """ - if not location: - return - - if not isinstance(location, Path): - location = Path(location) - - if location.suffix not in java_extensions: - return - - with open(location) as lines: - return find_java_package(lines) - - -def find_java_package(lines): - """ - Return a mapping of ``{'java_package': }`` or ``None`` from an iterable or - text ``lines``. - - For example:: - - >>> lines = [" package foo.back ; # dsasdasdasdasdasda.asdasdasd"] - >>> assert find_java_package(lines) == {"java_package": "foo.back"} - """ - package = _find_java_package(lines) - if package: - return {"java_package": package} + if not isinstance(location, Path): + location = Path(location) + if location.suffix not in cls.source_extensions: + return -def _find_java_package(lines): + with open(location) as lines: + return cls.find_source_package(lines) + + @classmethod + def find_source_package(cls, lines): + package = find_expression(lines=lines, regex=cls.package_regex) + if package: + return {cls.source_package_attribute_name: package} + + @classmethod + def scan_for_source_package(cls, location, with_threading=True): + """ + Run a Jvm source package scan on the file at ``location``. + + Return a mapping of scan ``results`` and a list of ``errors``. + """ + scanners = [ + scancode.Scanner( + name=f"{cls.source_package_attribute_name}", + function=cls.get_source_package, + ) + ] + return scancode._scan_resource( + location=location, scanners=scanners, with_threading=with_threading + ) + + @classmethod + def get_indexable_qualified_paths(cls, from_resources_dot_java): + """ + Yield tuples of (resource id, fully-qualified Java class name) for indexable + classes from the "from/" side of the project codebase using the + "java_package" Resource.extra_data. + """ + resource_values = from_resources_dot_java.values_list( + "id", "name", "extra_data" + ) + return cls.get_indexable_qualified_paths_from_values(resource_values) + + @classmethod + def get_indexable_qualified_paths_from_values(cls, resource_values): + """ + Yield tuples of (resource id, fully-qualified Java path) for indexable + classes from a list of ``resource_data`` tuples of "from/" side of the + project codebase. + + These ``resource_data`` input tuples are in the form: + (resource.id, resource.name, resource.extra_data) + + And the output tuples look like this example:: + (123, "org/apache/commons/LoggerImpl.java") + """ + for resource_id, resource_name, resource_extra_data in resource_values: + fully_qualified = get_fully_qualified_path( + jvm_package=resource_extra_data.get(cls.source_package_attribute_name), + filename=resource_name, + ) + yield resource_id, fully_qualified + + @classmethod + def get_normalized_path(cls, path, extension): + """ + Return a normalized JVM file path for ``path`` .class file path string. + Account for inner classes in that their file name is the name of their + outer class. + """ + if not path.endswith(cls.binary_extensions): + raise ValueError( + f"Only path ending with {cls.binary_extensions} are supported." + ) + path = Path(path.strip("/")) + class_name = path.name + if "$" in class_name: # inner class + class_name, _, _ = class_name.partition("$") + else: + class_name, _, _ = class_name.partition(".") # plain .class + return str(path.parent / f"{class_name}{extension}") + + +def find_expression(lines, regex): """ - Return a Java package or ``None`` from an iterable or text ``lines``. - + Return a value found using ``regex`` in the first 500 ``lines`` or ``None``. For example:: >>> lines = [" package foo.back ; # dsasdasdasdasdasda.asdasdasd"] - >>> assert _find_java_package(lines) == "foo.back", _find_java_package(lines) + >>> regex = java_package_re + >>> assert find_expression(lines, regex) == "foo.back" """ for ln, line in enumerate(lines): # only look at the first 500 lines if ln > 500: return - for package in java_package_re.findall(line): - if package: - return package + for value in regex.findall(line): + if value: + return value -def get_normalized_java_path(path): - """ - Return a normalized .java file path for ``path`` .class file path string. - Account for inner classes in that their .java file name is the name of their - outer class. +class JavaLanguage(JvmLanguage): + name = "java" + source_extensions = (".java",) + binary_extensions = (".class",) + source_package_attribute_name = "java_package" + package_regex = re.compile(r"^\s*package\s+([\w\.]+)\s*;?") + binary_map_type = "java_to_class" - For example:: - >>> get_normalized_java_path("foo/org/common/Bar$inner.class") - 'foo/org/common/Bar.java' - >>> get_normalized_java_path("foo/org/common/Bar.class") - 'foo/org/common/Bar.java' - """ - if not path.endswith(".class"): - raise ValueError("Only path ending with .class are supported.") - path = Path(path.strip("/")) - class_name = path.name - if "$" in class_name: # inner class - class_name, _, _ = class_name.partition("$") - else: - class_name, _, _ = class_name.partition(".") # plain .class - return str(path.parent / f"{class_name}.java") - - -def get_fully_qualified_java_path(java_package, filename): +class ScalaLanguage(JvmLanguage): + name = "scala" + source_extensions = (".scala",) + binary_extensions = (".class",) + source_package_attribute_name = "scala_package" + package_regex = re.compile(r"^\s*package\s+([\w\.]+)\s*;?") + binary_map_type = "scala_to_class" + + +class KotlinLanguage(JvmLanguage): + name = "kotlin" + source_extensions = (".kt", ".kts") + binary_extensions = (".class",) + source_package_attribute_name = "kotlin_package" + package_regex = re.compile(r"^\s*package\s+([\w\.]+)\s*;?") + binary_map_type = "kotlin_to_class" + + +def get_fully_qualified_path(jvm_package, filename): """ Return a fully qualified java path of a .java ``filename`` in a ``java_package`` string. @@ -116,8 +193,8 @@ def get_fully_qualified_java_path(java_package, filename): For example:: - >>> get_fully_qualified_java_path("org.common" , "Bar.java") + >>> get_fully_qualified_path("org.common" , "Bar.java") 'org/common/Bar.java' """ - java_package = java_package.replace(".", "/") - return f"{java_package}/{filename}" + jvm_package = jvm_package.replace(".", "/") + return f"{jvm_package}/{filename}" diff --git a/scanpipe/tests/pipes/test_d2d.py b/scanpipe/tests/pipes/test_d2d.py index b81b3510c4..354900af58 100644 --- a/scanpipe/tests/pipes/test_d2d.py +++ b/scanpipe/tests/pipes/test_d2d.py @@ -40,6 +40,7 @@ from scanpipe.pipes import d2d from scanpipe.pipes import d2d_config from scanpipe.pipes import flag +from scanpipe.pipes import jvm from scanpipe.pipes import scancode from scanpipe.pipes import symbols from scanpipe.pipes.input import copy_input @@ -367,9 +368,11 @@ def test_scanpipe_pipes_d2d_map_java_to_class(self): ) buffer = io.StringIO() - d2d.map_java_to_class(self.project1, logger=buffer.write) + d2d.map_jvm_to_class( + self.project1, logger=buffer.write, jvm_lang=jvm.JavaLanguage + ) - expected = "Mapping 3 .class resources to 2 .java" + expected = "Mapping 3 .class resources to 2 ('.java',)" self.assertIn(expected, buffer.getvalue()) self.assertEqual(2, self.project1.codebaserelations.count()) @@ -393,8 +396,10 @@ def test_scanpipe_pipes_d2d_map_java_to_class(self): def test_scanpipe_pipes_d2d_map_java_to_class_no_java(self): make_resource_file(self.project1, path="to/Abstract.class") buffer = io.StringIO() - d2d.map_java_to_class(self.project1, logger=buffer.write) - expected = "No .java resources to map." + d2d.map_jvm_to_class( + self.project1, logger=buffer.write, jvm_lang=jvm.JavaLanguage + ) + expected = "No ('.java',) resources to map." self.assertIn(expected, buffer.getvalue()) def test_scanpipe_pipes_d2d_map_jar_to_source(self): @@ -423,7 +428,9 @@ def test_scanpipe_pipes_d2d_map_jar_to_source(self): ) buffer = io.StringIO() - d2d.map_java_to_class(self.project1, logger=buffer.write) + d2d.map_jvm_to_class( + self.project1, logger=buffer.write, jvm_lang=jvm.JavaLanguage + ) relation = self.project1.codebaserelations.get() self.assertEqual(from1, relation.from_resource) self.assertEqual(to1, relation.to_resource) @@ -433,7 +440,9 @@ def test_scanpipe_pipes_d2d_map_jar_to_source(self): buffer = io.StringIO() with self.assertNumQueries(6): - d2d.map_jar_to_source(self.project1, logger=buffer.write) + d2d.map_jar_to_jvm_source( + self.project1, logger=buffer.write, jvm_lang=jvm.JavaLanguage + ) expected = "Mapping 1 .jar resources using map_jar_to_source" self.assertIn(expected, buffer.getvalue()) @@ -460,7 +469,7 @@ def test_scanpipe_pipes_d2d_map_jar_to_source_works_for_jar(self): path="to/org/apache/logging/log4j/core/util/SystemClock.class", ) - d2d.map_java_to_class(self.project1) + d2d.map_jvm_to_class(self.project1, jvm_lang=jvm.JavaLanguage) expected = [ (from1.path, to1.path, "java_to_class"), @@ -495,7 +504,7 @@ def test_scanpipe_pipes_d2d_get_indexable_qualified_java_paths_from_values_yield (2, "org/apache/logging/log4j/core/util/SystemClock2.java"), ] results = list( - d2d.get_indexable_qualified_java_paths_from_values(resource_values) + jvm.JavaLanguage.get_indexable_qualified_paths_from_values(resource_values) ) self.assertEqual(expected, results) @@ -550,9 +559,11 @@ def test_scanpipe_pipes_d2d_find_java_packages(self): pipes.collect_and_create_codebase_resources(self.project1) buffer = io.StringIO() - d2d.find_java_packages(self.project1, logger=buffer.write) + d2d.find_jvm_packages( + self.project1, jvm_lang=jvm.JavaLanguage, logger=buffer.write + ) - expected = "Finding Java package for 2 .java resources." + expected = "Finding java packages for 2 ('.java',) resources." self.assertEqual(expected, buffer.getvalue()) expected = [ diff --git a/scanpipe/tests/pipes/test_jvm.py b/scanpipe/tests/pipes/test_jvm.py index 9d9588a813..b2f819ac4c 100644 --- a/scanpipe/tests/pipes/test_jvm.py +++ b/scanpipe/tests/pipes/test_jvm.py @@ -55,41 +55,45 @@ class ScanPipeJvmTest(TestCase): data = Path(__file__).parent.parent / "data" def test_scanpipe_pipes_jvm_find_java_package(self): - package = jvm.find_java_package(java_code.splitlines()) + package = jvm.JavaLanguage.find_source_package(java_code.splitlines()) self.assertEqual({"java_package": "org.apache.logging.log4j.core"}, package) def test_scanpipe_pipes_jvm_find_java_package_with_spaces(self): lines = [" package foo.back ; # dsasdasdasdasdasda.asdasdasd"] - package = jvm.find_java_package(lines) + package = jvm.JavaLanguage.find_source_package(lines) self.assertEqual({"java_package": "foo.back"}, package) def test_scanpipe_pipes_jvm_find_java_package_return_None(self): - package = jvm.find_java_package(java_package_too_far_down.splitlines()) + package = jvm.JavaLanguage.find_source_package( + java_package_too_far_down.splitlines() + ) self.assertIsNone(package) def test_scanpipe_pipes_jvm_get_java_package(self): input_location = self.data / "jvm" / "common.java" - package = jvm.get_java_package(input_location) + package = jvm.JavaLanguage.get_source_package(input_location) self.assertEqual({"java_package": "org.apache.logging.log4j.core"}, package) def test_scanpipe_pipes_jvm_get_java_package_with_string(self): input_location = self.data / "jvm" / "common.java" - package = jvm.get_java_package(str(input_location)) + package = jvm.JavaLanguage.get_source_package(str(input_location)) self.assertEqual({"java_package": "org.apache.logging.log4j.core"}, package) def test_scanpipe_pipes_jvm_get_java_package_too_far_down(self): input_location = self.data / "jvm" / "no-package.java" - package = jvm.get_java_package(input_location) + package = jvm.JavaLanguage.get_source_package(input_location) self.assertIsNone(package) def test_scanpipe_pipes_jvm_get_normalized_java_path(self): - njp = jvm.get_normalized_java_path("foo/org/common/Bar.class") + njp = jvm.JavaLanguage.get_normalized_path("foo/org/common/Bar.class", ".java") self.assertEqual("foo/org/common/Bar.java", njp) def test_scanpipe_pipes_jvm_get_normalized_java_path_with_inner_class(self): - njp = jvm.get_normalized_java_path("foo/org/common/Bar$inner.class") + njp = jvm.JavaLanguage.get_normalized_path( + "foo/org/common/Bar$inner.class", ".java" + ) self.assertEqual("foo/org/common/Bar.java", njp) def test_scanpipe_pipes_jvm_get_fully_qualified_java_path(self): - fqjp = jvm.get_fully_qualified_java_path("org.common", "Bar.java") + fqjp = jvm.get_fully_qualified_path("org.common", "Bar.java") self.assertEqual("org/common/Bar.java", fqjp) From a8dcd39500d045388d9c1793645112dc177058c6 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Tue, 7 Oct 2025 19:28:58 +0530 Subject: [PATCH 2/5] Add tests for scala and kotlin Signed-off-by: Tushar Goel --- scanpipe/pipelines/deploy_to_develop.py | 13 ++- scanpipe/tests/pipes/test_d2d.py | 100 +++++++++++++++++++++++- 2 files changed, 110 insertions(+), 3 deletions(-) diff --git a/scanpipe/pipelines/deploy_to_develop.py b/scanpipe/pipelines/deploy_to_develop.py index 48a7bea3d4..19f0557881 100644 --- a/scanpipe/pipelines/deploy_to_develop.py +++ b/scanpipe/pipelines/deploy_to_develop.py @@ -73,11 +73,13 @@ def steps(cls): cls.match_archives_to_purldb, cls.find_java_packages, cls.map_java_to_class, - cls.map_jar_to_source, + cls.map_jar_to_java_source, cls.find_scala_packages, cls.map_scala_to_class, + cls.map_jar_to_scala_source, cls.find_kotlin_packages, cls.map_kotlin_to_class, + cls.map_jar_to_kotlin_source, cls.map_javascript, cls.map_javascript_symbols, cls.map_javascript_strings, @@ -185,7 +187,7 @@ def map_java_to_class(self): ) @optional_step("Java") - def map_jar_to_source(self): + def map_jar_to_java_source(self): """Map .jar files to their related source directory.""" d2d.map_jar_to_jvm_source( project=self.project, logger=self.log, jvm_lang=jvm.JavaLanguage @@ -226,6 +228,13 @@ def map_kotlin_to_class(self): project=self.project, logger=self.log, jvm_lang=jvm.KotlinLanguage ) + @optional_step("Kotlin") + def map_jar_to_kotlin_source(self): + """Map .jar files to their related source directory.""" + d2d.map_jar_to_jvm_source( + project=self.project, logger=self.log, jvm_lang=jvm.KotlinLanguage + ) + @optional_step("JavaScript") def map_javascript(self): """ diff --git a/scanpipe/tests/pipes/test_d2d.py b/scanpipe/tests/pipes/test_d2d.py index 354900af58..54e3c1230a 100644 --- a/scanpipe/tests/pipes/test_d2d.py +++ b/scanpipe/tests/pipes/test_d2d.py @@ -402,7 +402,7 @@ def test_scanpipe_pipes_d2d_map_java_to_class_no_java(self): expected = "No ('.java',) resources to map." self.assertIn(expected, buffer.getvalue()) - def test_scanpipe_pipes_d2d_map_jar_to_source(self): + def test_scanpipe_pipes_d2d_map_jar_to_java_source(self): from1 = make_resource_file( self.project1, path="from/flume-ng-node-1.9.0-sources.jar-extract/org/apache/flume/node/" @@ -450,6 +450,104 @@ def test_scanpipe_pipes_d2d_map_jar_to_source(self): relation = self.project1.codebaserelations.get(map_type="jar_to_source") self.assertEqual(from2, relation.from_resource) self.assertEqual(to_jar, relation.to_resource) + + def test_scanpipe_pipes_d2d_map_jar_to_scala_source(self): + from1 = make_resource_file( + self.project1, + path="from/flume-ng-node-1.9.0-sources.jar-extract/org/apache/flume/node/" + "AbstractConfigurationProvider.scala", + extra_data={"scala_package": "org.apache.flume.node"}, + ) + from2 = make_resource_file( + self.project1, + path="from/flume-ng-node-1.9.0-sources.jar-extract", + ) + to1 = make_resource_file( + self.project1, + path="to/flume-ng-node-1.9.0.jar-extract/org/apache/flume/node/" + "AbstractConfigurationProvider.class", + ) + make_resource_file( + self.project1, + path="to/flume-ng-node-1.9.0.jar-extract/META-INF/MANIFEST.MF", + ) + to_jar = make_resource_file( + self.project1, + path="to/flume-ng-node-1.9.0.jar", + ) + + buffer = io.StringIO() + d2d.map_jvm_to_class( + self.project1, logger=buffer.write, jvm_lang=jvm.ScalaLanguage + ) + relation = self.project1.codebaserelations.get() + self.assertEqual(from1, relation.from_resource) + self.assertEqual(to1, relation.to_resource) + self.assertEqual("scala_to_class", relation.map_type) + expected = {"from_source_root": "from/flume-ng-node-1.9.0-sources.jar-extract/"} + self.assertEqual(expected, relation.extra_data) + + buffer = io.StringIO() + with self.assertNumQueries(6): + d2d.map_jar_to_jvm_source( + self.project1, logger=buffer.write, jvm_lang=jvm.ScalaLanguage + ) + expected = "Mapping 1 .jar resources using map_jar_to_source" + self.assertIn(expected, buffer.getvalue()) + + self.assertEqual(2, self.project1.codebaserelations.count()) + relation = self.project1.codebaserelations.get(map_type="jar_to_source") + self.assertEqual(from2, relation.from_resource) + self.assertEqual(to_jar, relation.to_resource) + + def test_scanpipe_pipes_d2d_map_jar_to_kotlin_source(self): + from1 = make_resource_file( + self.project1, + path="from/flume-ng-node-1.9.0-sources.jar-extract/org/apache/flume/node/" + "AbstractConfigurationProvider.kt", + extra_data={"kotlin_package": "org.apache.flume.node"}, + ) + from2 = make_resource_file( + self.project1, + path="from/flume-ng-node-1.9.0-sources.jar-extract", + ) + to1 = make_resource_file( + self.project1, + path="to/flume-ng-node-1.9.0.jar-extract/org/apache/flume/node/" + "AbstractConfigurationProvider.class", + ) + make_resource_file( + self.project1, + path="to/flume-ng-node-1.9.0.jar-extract/META-INF/MANIFEST.MF", + ) + to_jar = make_resource_file( + self.project1, + path="to/flume-ng-node-1.9.0.jar", + ) + + buffer = io.StringIO() + d2d.map_jvm_to_class( + self.project1, logger=buffer.write, jvm_lang=jvm.KotlinLanguage + ) + relation = self.project1.codebaserelations.get() + self.assertEqual(from1, relation.from_resource) + self.assertEqual(to1, relation.to_resource) + self.assertEqual("kotlin_to_class", relation.map_type) + expected = {"from_source_root": "from/flume-ng-node-1.9.0-sources.jar-extract/"} + self.assertEqual(expected, relation.extra_data) + + buffer = io.StringIO() + with self.assertNumQueries(6): + d2d.map_jar_to_jvm_source( + self.project1, logger=buffer.write, jvm_lang=jvm.KotlinLanguage + ) + expected = "Mapping 1 .jar resources using map_jar_to_source" + self.assertIn(expected, buffer.getvalue()) + + self.assertEqual(2, self.project1.codebaserelations.count()) + relation = self.project1.codebaserelations.get(map_type="jar_to_source") + self.assertEqual(from2, relation.from_resource) + self.assertEqual(to_jar, relation.to_resource) def test_scanpipe_pipes_d2d_map_jar_to_source_works_for_jar(self): from1 = make_resource_file( From 61ec8671a81c2ce1a5d022f504aed6135eed2180 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Tue, 7 Oct 2025 19:33:05 +0530 Subject: [PATCH 3/5] Fix formatting Signed-off-by: Tushar Goel --- scanpipe/tests/pipes/test_d2d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpipe/tests/pipes/test_d2d.py b/scanpipe/tests/pipes/test_d2d.py index 54e3c1230a..120c213272 100644 --- a/scanpipe/tests/pipes/test_d2d.py +++ b/scanpipe/tests/pipes/test_d2d.py @@ -450,7 +450,7 @@ def test_scanpipe_pipes_d2d_map_jar_to_java_source(self): relation = self.project1.codebaserelations.get(map_type="jar_to_source") self.assertEqual(from2, relation.from_resource) self.assertEqual(to_jar, relation.to_resource) - + def test_scanpipe_pipes_d2d_map_jar_to_scala_source(self): from1 = make_resource_file( self.project1, @@ -499,7 +499,7 @@ def test_scanpipe_pipes_d2d_map_jar_to_scala_source(self): relation = self.project1.codebaserelations.get(map_type="jar_to_source") self.assertEqual(from2, relation.from_resource) self.assertEqual(to_jar, relation.to_resource) - + def test_scanpipe_pipes_d2d_map_jar_to_kotlin_source(self): from1 = make_resource_file( self.project1, From ccae7100d32ca0ea0eeb8e4b959f49e6b1adc6b9 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 8 Oct 2025 14:27:54 +0530 Subject: [PATCH 4/5] Handle path normalization for kotlin Signed-off-by: Tushar Goel --- scanpipe/pipes/jvm.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/scanpipe/pipes/jvm.py b/scanpipe/pipes/jvm.py index ede6f3fb54..d56a32c2eb 100644 --- a/scanpipe/pipes/jvm.py +++ b/scanpipe/pipes/jvm.py @@ -163,7 +163,7 @@ class JavaLanguage(JvmLanguage): source_extensions = (".java",) binary_extensions = (".class",) source_package_attribute_name = "java_package" - package_regex = re.compile(r"^\s*package\s+([\w\.]+)\s*;?") + package_regex = re.compile(r"^\s*package\s+([\w\.]+)\s*;") binary_map_type = "java_to_class" @@ -184,6 +184,27 @@ class KotlinLanguage(JvmLanguage): package_regex = re.compile(r"^\s*package\s+([\w\.]+)\s*;?") binary_map_type = "kotlin_to_class" + @classmethod + def get_normalized_path(cls, path, extension): + """ + Return a normalized JVM file path for ``path`` .class file path string. + Account for inner classes in that their file name is the name of their + outer class. + """ + if not path.endswith(cls.binary_extensions): + raise ValueError( + f"Only path ending with {cls.binary_extensions} are supported." + ) + path = Path(path.strip("/")) + class_name = path.name + if "$" in class_name: # inner class + class_name, _, _ = class_name.partition("$") + else: + class_name, _, _ = class_name.partition(".") # plain .class + if class_name.endswith("Kt"): + class_name = class_name[: -len("Kt")] + return str(path.parent / f"{class_name}{extension}") + def get_fully_qualified_path(jvm_package, filename): """ From 3865f0eecb3aff259fabefb1047b9ac87c3c0d15 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 8 Oct 2025 14:35:34 +0530 Subject: [PATCH 5/5] Add tests for Kotlin and Scala Signed-off-by: Tushar Goel --- scanpipe/pipes/jvm.py | 3 +- scanpipe/tests/pipes/test_jvm.py | 95 ++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/scanpipe/pipes/jvm.py b/scanpipe/pipes/jvm.py index d56a32c2eb..9020b8f0d0 100644 --- a/scanpipe/pipes/jvm.py +++ b/scanpipe/pipes/jvm.py @@ -201,8 +201,7 @@ def get_normalized_path(cls, path, extension): class_name, _, _ = class_name.partition("$") else: class_name, _, _ = class_name.partition(".") # plain .class - if class_name.endswith("Kt"): - class_name = class_name[: -len("Kt")] + class_name = class_name.removesuffix("Kt") return str(path.parent / f"{class_name}{extension}") diff --git a/scanpipe/tests/pipes/test_jvm.py b/scanpipe/tests/pipes/test_jvm.py index b2f819ac4c..a7d9ebf982 100644 --- a/scanpipe/tests/pipes/test_jvm.py +++ b/scanpipe/tests/pipes/test_jvm.py @@ -97,3 +97,98 @@ def test_scanpipe_pipes_jvm_get_normalized_java_path_with_inner_class(self): def test_scanpipe_pipes_jvm_get_fully_qualified_java_path(self): fqjp = jvm.get_fully_qualified_path("org.common", "Bar.java") self.assertEqual("org/common/Bar.java", fqjp) + + +class ScanPipeJvmScalaTest(TestCase): + data = Path(__file__).parent.parent / "data" + + scala_code = """ + package org.apache.logging.log4j.scala + import scala.concurrent.Future + """ + + scala_package_too_far_down = ("\n" * 501) + "package org.apache.logging.log4j.scala" + + def test_scanpipe_pipes_jvm_find_scala_package(self): + package = jvm.ScalaLanguage.find_source_package(self.scala_code.splitlines()) + self.assertEqual({"scala_package": "org.apache.logging.log4j.scala"}, package) + + def test_scanpipe_pipes_jvm_find_scala_package_with_spaces(self): + lines = [" package foo.bar.baz ;"] + package = jvm.ScalaLanguage.find_source_package(lines) + self.assertEqual({"scala_package": "foo.bar.baz"}, package) + + def test_scanpipe_pipes_jvm_find_scala_package_return_None(self): + package = jvm.ScalaLanguage.find_source_package( + self.scala_package_too_far_down.splitlines() + ) + self.assertIsNone(package) + + def test_scanpipe_pipes_jvm_get_normalized_scala_path(self): + njp = jvm.ScalaLanguage.get_normalized_path( + "foo/org/common/Bar.class", ".scala" + ) + self.assertEqual("foo/org/common/Bar.scala", njp) + + def test_scanpipe_pipes_jvm_get_normalized_scala_path_with_inner_class(self): + njp = jvm.ScalaLanguage.get_normalized_path( + "foo/org/common/Bar$inner.class", ".scala" + ) + self.assertEqual("foo/org/common/Bar.scala", njp) + + def test_scanpipe_pipes_jvm_get_fully_qualified_scala_path(self): + fqjp = jvm.get_fully_qualified_path("org.common", "Bar.scala") + self.assertEqual("org/common/Bar.scala", fqjp) + + +class ScanPipeJvmKotlinTest(TestCase): + data = Path(__file__).parent.parent / "data" + + kotlin_code = """ + package org.apache.logging.log4j.kotlin + + import kotlinx.coroutines.Deferred + """ + + kotlin_package_too_far_down = ( + "\n" * 501 + ) + "package org.apache.logging.log4j.kotlin" + + def test_scanpipe_pipes_jvm_find_kotlin_package(self): + package = jvm.KotlinLanguage.find_source_package(self.kotlin_code.splitlines()) + self.assertEqual({"kotlin_package": "org.apache.logging.log4j.kotlin"}, package) + + def test_scanpipe_pipes_jvm_find_kotlin_package_with_spaces(self): + lines = [" package foo.bar.baz "] + package = jvm.KotlinLanguage.find_source_package(lines) + self.assertEqual({"kotlin_package": "foo.bar.baz"}, package) + + def test_scanpipe_pipes_jvm_find_kotlin_package_return_None(self): + package = jvm.KotlinLanguage.find_source_package( + self.kotlin_package_too_far_down.splitlines() + ) + self.assertIsNone(package) + + def test_scanpipe_pipes_jvm_get_normalized_kotlin_path(self): + njp = jvm.KotlinLanguage.get_normalized_path("foo/org/common/Bar.class", ".kt") + self.assertEqual("foo/org/common/Bar.kt", njp) + + def test_scanpipe_pipes_jvm_get_normalized_kotlin_path_with_inner_class(self): + njp = jvm.KotlinLanguage.get_normalized_path( + "foo/org/common/Bar$inner.class", ".kt" + ) + self.assertEqual("foo/org/common/Bar.kt", njp) + njp = jvm.KotlinLanguage.get_normalized_path( + "foo/org/common/BarKt$inner.class", ".kt" + ) + self.assertEqual("foo/org/common/Bar.kt", njp) + + def test_scanpipe_pipes_jvm_get_normalized_kotlin_path_with_Kt_suffix(self): + njp = jvm.KotlinLanguage.get_normalized_path( + "foo/org/common/LoggerKt.class", ".kt" + ) + self.assertEqual("foo/org/common/Logger.kt", njp) + + def test_scanpipe_pipes_jvm_get_fully_qualified_kotlin_path(self): + fqjp = jvm.get_fully_qualified_path("org.common", "Bar.kt") + self.assertEqual("org/common/Bar.kt", fqjp)