Skip to content

Commit 42be421

Browse files
authored
Add support for tracking parent of CodebaseResource entries and ensure top level paths are stored (#1691)
* Add support for storing top-level paths of the codebase Signed-off-by: Aayush Kumar <[email protected]> * Add `ancestor` field to CodebaseResource to track parent path of a resource Signed-off-by: Aayush Kumar <[email protected]> * fix line too long error in scanpipe/models.py Signed-off-by: Aayush Kumar <[email protected]> * update tests Signed-off-by: Aayush Kumar <[email protected]> * rename `ancestor` field to `parent_directory_path` Signed-off-by: Aayush Kumar <[email protected]> * add save() method to CodebaseResource to ensure `parent_directory_path` is always set Signed-off-by: Aayush Kumar <[email protected]> * add tests Signed-off-by: Aayush Kumar <[email protected]> * fix code format Signed-off-by: Aayush Kumar <[email protected]> * rename parent_directory_path field to parent_path Signed-off-by: Aayush Kumar <[email protected]> * fix code format Signed-off-by: Aayush Kumar <[email protected]> * minor fixes and adjustments following review feedback Signed-off-by: Aayush Kumar <[email protected]> * Simplify return statement in `parent_directory` for better readability Signed-off-by: Aayush Kumar <[email protected]> * bump migration Signed-off-by: Aayush Kumar <[email protected]> * update parent_path to display root files on empty string instead of None to align with the code format Signed-off-by: Aayush Kumar <[email protected]> * fix `scan_single_package` not giving correct `parent_path` Signed-off-by: Aayush Kumar <[email protected]> * fix code format Signed-off-by: Aayush Kumar <[email protected]> * regen tests Signed-off-by: Aayush Kumar <[email protected]> * bump migration Signed-off-by: Aayush Kumar <[email protected]> * make create_codebase_resource function less complex Signed-off-by: Aayush Kumar <[email protected]> --------- Signed-off-by: Aayush Kumar <[email protected]>
1 parent d56b5eb commit 42be421

File tree

9 files changed

+247
-61
lines changed

9 files changed

+247
-61
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Generated by Django 5.1.9 on 2025-06-16 17:42
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the `parent_path` field and its (project, parent_path) index."""

    dependencies = [
        ('scanpipe', '0074_discovered_license_models'),
    ]

    operations = [
        migrations.AddField(
            model_name='codebaseresource',
            name='parent_path',
            # The help_text mirrors the wording of the model field declaration
            # in scanpipe/models.py (empty string, not None, for root
            # resources) so the migration state matches the model.
            field=models.CharField(blank=True, help_text='The path of the resource\'s parent directory. Set to empty string for top-level (root) resources. Used to efficiently retrieve a directory\'s contents.', max_length=2000),
        ),
        migrations.AddIndex(
            model_name='codebaseresource',
            index=models.Index(fields=['project', 'parent_path'], name='scanpipe_co_project_008448_idx'),
        ),
    ]

scanpipe/models.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2745,6 +2745,17 @@ class CodebaseResource(
27452745
'Eg.: "/usr/bin/bash" for a path of "tarball-extract/rootfs/usr/bin/bash"'
27462746
),
27472747
)
2748+
2749+
parent_path = models.CharField(
2750+
max_length=2000,
2751+
blank=True,
2752+
help_text=_(
2753+
"The path of the resource's parent directory. "
2754+
"Set to empty string for top-level (root) resources. "
2755+
"Used to efficiently retrieve a directory's contents."
2756+
),
2757+
)
2758+
27482759
status = models.CharField(
27492760
blank=True,
27502761
max_length=50,
@@ -2838,6 +2849,7 @@ class Meta:
28382849
models.Index(fields=["compliance_alert"]),
28392850
models.Index(fields=["is_binary"]),
28402851
models.Index(fields=["is_text"]),
2852+
models.Index(fields=["project", "parent_path"]),
28412853
]
28422854
constraints = [
28432855
models.UniqueConstraint(
@@ -2850,6 +2862,11 @@ class Meta:
28502862
def __str__(self):
28512863
return self.path
28522864

2865+
def save(self, *args, **kwargs):
    """
    Save the resource, filling in `parent_path` from `path` beforehand
    when `path` is set and `parent_path` is still empty.
    """
    needs_parent_path = bool(self.path) and not self.parent_path
    if needs_parent_path:
        computed_parent = self.parent_directory()
        # Root-level resources have no parent: store an empty string.
        self.parent_path = computed_parent if computed_parent else ""
    super().save(*args, **kwargs)
2869+
28532870
def get_absolute_url(self):
28542871
return reverse("resource_detail", args=[self.project.slug, self.path])
28552872

@@ -2920,7 +2937,8 @@ def get_path_segments_with_subpath(self):
29202937

29212938
def parent_directory(self):
    """Return the parent path for this CodebaseResource or None."""
    resource_path = str(self.path)
    parent = parent_directory(resource_path, with_trail=False)
    # Normalize any falsy result (such as an empty string) to None.
    if not parent:
        return None
    return parent
29242942

29252943
def has_parent(self):
29262944
"""

scanpipe/pipes/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ def make_codebase_resource(project, location, save=True, **extra_fields):
7272
from scanpipe.pipes import flag
7373

7474
relative_path = Path(location).relative_to(project.codebase_path)
75+
parent_path = str(relative_path.parent)
76+
77+
if parent_path == ".":
78+
parent_path = ""
79+
7580
try:
7681
resource_data = scancode.get_resource_info(location=str(location))
7782
except OSError as error:
@@ -92,6 +97,7 @@ def make_codebase_resource(project, location, save=True, **extra_fields):
9297
codebase_resource = CodebaseResource(
9398
project=project,
9499
path=relative_path,
100+
parent_path=parent_path,
95101
**resource_data,
96102
)
97103

scanpipe/pipes/rootfs.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,14 @@ def get_res(parent, fname):
139139
rootfs_path=rootfs_path,
140140
)
141141

142+
# Explicitly yields the root directory as a resource when `with_dir` is True
143+
if with_dir:
144+
rootfs_path = "/"
145+
yield Resource(
146+
location=location,
147+
rootfs_path=rootfs_path,
148+
)
149+
142150
for top, dirs, files in os.walk(location):
143151
for f in files:
144152
yield get_res(parent=top, fname=f)

scanpipe/pipes/scancode.py

Lines changed: 73 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -901,80 +901,94 @@ def get_virtual_codebase(project, input_location):
901901
return VirtualCodebase(input_location, temp_dir=str(temp_path), max_in_memory=0)
902902

903903

904-
def create_codebase_resources(project, scanned_codebase):
905-
"""
906-
Save the resources of a ScanCode `scanned_codebase` scancode.resource.Codebase
907-
object to the database as a CodebaseResource of the `project`.
908-
This function can be used to expend an existing `project` Codebase with new
909-
CodebaseResource objects as the existing objects (based on the `path`) will be
910-
skipped.
911-
"""
912-
for scanned_resource in scanned_codebase.walk(skip_root=True):
913-
resource_data = {}
914-
915-
for field in CodebaseResource._meta.fields:
916-
# Do not include the path as provided by the scanned_resource since it
917-
# includes the "root". The `get_path` method is used instead.
918-
if field.name == "path":
919-
continue
920-
value = getattr(scanned_resource, field.name, None)
921-
if value is not None:
922-
resource_data[field.name] = value
923-
924-
resource_type = "FILE" if scanned_resource.is_file else "DIRECTORY"
925-
resource_data["type"] = CodebaseResource.Type[resource_type]
926-
resource_path = scanned_resource.get_path(strip_root=True)
927-
928-
codebase_resource, _ = CodebaseResource.objects.get_or_create(
904+
def create_codebase_resource(project, scanned_resource):
    """
    Create a CodebaseResource entry from ScanCode scanned data.

    The resource is fetched or created for the `project` using the
    root-stripped path of the `scanned_resource`. Package assignments,
    license detections, license clues, and the license detections found in
    the resource package data are then recorded for that path.
    """
    resource_data = {}

    for field in CodebaseResource._meta.fields:
        # Do not include the path as provided by the scanned_resource since it
        # includes the "root". The `get_path` method is used instead, and the
        # `parent_path` is derived from that root-stripped path below.
        if field.name in ["path", "parent_path"]:
            continue
        value = getattr(scanned_resource, field.name, None)
        if value is not None:
            resource_data[field.name] = value

    resource_type = "FILE" if scanned_resource.is_file else "DIRECTORY"
    resource_data["type"] = CodebaseResource.Type[resource_type]
    resource_path = scanned_resource.get_path(strip_root=True)

    # `Path("name").parent` is "." for a top-level entry; such root resources
    # are stored with an empty `parent_path`.
    parent_path = str(Path(resource_path).parent)
    if parent_path == ".":
        parent_path = ""
    resource_data["parent_path"] = parent_path

    codebase_resource, _ = CodebaseResource.objects.get_or_create(
        project=project,
        path=resource_path,
        defaults=resource_data,
    )

    # Handle package assignments
    for_packages = getattr(scanned_resource, "for_packages", [])
    for package_uid in for_packages:
        logger.debug(f"Assign {package_uid} to {codebase_resource}")
        package = project.discoveredpackages.get(package_uid=package_uid)
        set_codebase_resource_for_package(
            codebase_resource=codebase_resource,
            discovered_package=package,
        )

    # Handle license detections
    license_detections = getattr(scanned_resource, "license_detections", [])
    for detection_data in license_detections:
        detection_identifier = detection_data.get("identifier")
        pipes.update_or_create_license_detection(
            project=project,
            detection_data=detection_data,
            resource_path=resource_path,
            count_detection=False,
        )
        logger.debug(f"Add {codebase_resource} to {detection_identifier}")

    # Handle license clues
    license_clues = getattr(scanned_resource, "license_clues", [])
    for clue_data in license_clues:
        pipes.update_or_create_license_detection(
            project=project,
            detection_data=clue_data,
            resource_path=resource_path,
            is_license_clue=True,
        )
        logger.debug(f"Add license clue at {codebase_resource}")

    # Handle package data
    packages = getattr(scanned_resource, "package_data", [])
    for package_data in packages:
        # Build a new list instead of extending the one stored in
        # `package_data`: an in-place `extend` would modify the scanned data
        # and add the "other" detections again on every call. `or []` also
        # tolerates keys that are present with a None value.
        package_license_detections = [
            *(package_data.get("license_detections") or []),
            *(package_data.get("other_license_detections") or []),
        ]
        for detection_data in package_license_detections:
            detection_identifier = detection_data.get("identifier")
            pipes.update_or_create_license_detection(
                project=project,
                detection_data=detection_data,
                resource_path=resource_path,
                count_detection=False,
                from_package=True,
            )
            logger.debug(f"Add {codebase_resource} to {detection_identifier}")
953980

954-
license_clues = getattr(scanned_resource, "license_clues", [])
955-
for clue_data in license_clues:
956-
pipes.update_or_create_license_detection(
957-
project=project,
958-
detection_data=clue_data,
959-
resource_path=resource_path,
960-
is_license_clue=True,
961-
)
962-
logger.debug(f"Add license clue at {codebase_resource}")
963981

964-
packages = getattr(scanned_resource, "package_data", [])
965-
for package_data in packages:
966-
license_detections = package_data.get("license_detections", [])
967-
license_detections.extend(package_data.get("other_license_detections", []))
968-
for detection_data in license_detections:
969-
detection_identifier = detection_data.get("identifier")
970-
pipes.update_or_create_license_detection(
971-
project=project,
972-
detection_data=detection_data,
973-
resource_path=resource_path,
974-
count_detection=False,
975-
from_package=True,
976-
)
977-
logger.debug(f"Add {codebase_resource} to {detection_identifier}")
982+
def create_codebase_resources(project, scanned_codebase):
    """
    Store the resources of a ScanCode `scanned_codebase`
    (a scancode.resource.Codebase object) in the database as CodebaseResource
    entries of the `project`.
    This function can be used to extend an existing `project` Codebase with new
    CodebaseResource objects: resources that already exist (based on the
    `path`) are not created again.
    """
    # The codebase root itself is excluded from the walk.
    resources = scanned_codebase.walk(skip_root=True)
    for resource in resources:
        create_codebase_resource(project, resource)
978992

979993

980994
def create_discovered_packages(project, scanned_codebase):

scanpipe/tests/data/rootfs/basic-rootfs_root_filesystems.json

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,42 @@
340340
],
341341
"dependencies": [],
342342
"files": [
343+
{
344+
"path": "basic-rootfs.tar.gz-extract",
345+
"type": "directory",
346+
"name": "basic-rootfs.tar.gz-extract",
347+
"status": "scanned",
348+
"for_packages": [],
349+
"tag": "",
350+
"extension": ".tar.gz-extract",
351+
"programming_language": "",
352+
"detected_license_expression": "",
353+
"detected_license_expression_spdx": "",
354+
"license_detections": [],
355+
"license_clues": [],
356+
"percentage_of_license_text": null,
357+
"copyrights": [],
358+
"holders": [],
359+
"authors": [],
360+
"package_data": [],
361+
"emails": [],
362+
"urls": [],
363+
"md5": "",
364+
"sha1": "",
365+
"sha256": "",
366+
"sha512": "",
367+
"sha1_git": "",
368+
"is_binary": false,
369+
"is_text": false,
370+
"is_archive": false,
371+
"is_media": false,
372+
"is_legal": false,
373+
"is_manifest": false,
374+
"is_readme": false,
375+
"is_top_level": false,
376+
"is_key_file": false,
377+
"extra_data": {}
378+
},
343379
{
344380
"path": "basic-rootfs.tar.gz-extract/etc",
345381
"type": "directory",

scanpipe/tests/pipes/test_scancode.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,3 +723,21 @@ def test_scanpipe_scancode_resolve_dependencies_no_requirements(self):
723723
resolved_dep = project1.discovereddependencies.get(name="bluebird")
724724
self.assertEqual(resolved_dep, dep_2)
725725
self.assertEqual(resolved_dep.resolved_to_package, pkg_1)
726+
727+
def test_scanpipe_pipes_scancode_scan_single_package_correct_parent_path(self):
    """Check the `parent_path` values stored by the `scan_single_package` pipeline."""
    project = Project.objects.create(name="Analysis")
    archive_location = self.data / "scancode" / "is-npm-1.0.0.tgz"
    project.copy_input_from(archive_location)

    pipeline_run = project.add_pipeline("scan_single_package")
    pipeline_instance = pipeline_run.make_pipeline_instance()
    exitcode, out = pipeline_instance.execute()
    self.assertEqual(0, exitcode, msg=out)

    self.assertEqual(4, project.codebaseresources.count())

    # The top-level "package" directory has an empty parent path.
    package_dir = project.codebaseresources.get(path="package")
    self.assertEqual("", package_dir.parent_path)
    self.assertNotEqual("codebase", package_dir.parent_path)

    # A file inside "package" has that directory as its parent path.
    index_js = project.codebaseresources.get(path="package/index.js")
    self.assertEqual("package", index_js.parent_path)

scanpipe/tests/test_models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1646,6 +1646,16 @@ def test_scanpipe_can_compute_compliance_alert_for_license_exceptions(self):
16461646
resource.update(detected_license_expression=license_expression)
16471647
self.assertEqual("warning", resource.compute_compliance_alert())
16481648

1649+
def test_scanpipe_codebase_root_parent_path(self):
    """A resource created at the top level gets an empty `parent_path`."""
    root_file = self.project1.codebaseresources.create(path="file")
    expected_parent = ""
    self.assertEqual(expected_parent, root_file.parent_path)
1653+
1654+
def test_scanpipe_codebase_regular_parent_path(self):
    """A nested resource gets its containing directory as `parent_path`."""
    nested_file = self.project1.codebaseresources.create(path="dir1/dir2/file")
    expected_parent = "dir1/dir2"
    self.assertEqual(expected_parent, nested_file.parent_path)
1658+
16491659
def test_scanpipe_scan_fields_model_mixin_methods(self):
16501660
expected = [
16511661
"detected_license_expression",

0 commit comments

Comments
 (0)