Skip to content

Commit 993d80c

Browse files
committed
add support to store packages/archives locally
Signed-off-by: Varsha U N <[email protected]>
1 parent d9875ff commit 993d80c

21 files changed

+870
-5
lines changed

scancodeio/settings.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23+
import os
2324
import sys
2425
import tempfile
2526
from pathlib import Path
@@ -367,6 +368,15 @@
367368
PROJECT_DIR("static"),
368369
]
369370

371+
# Media files (Uploaded package archives, etc.)
372+
373+
MEDIA_URL = "/media/"
374+
MEDIA_ROOT = os.path.join(str(ROOT_DIR), "media")
375+
376+
# Package storage settings
377+
378+
ENABLE_PACKAGE_STORAGE = env.bool("ENABLE_PACKAGE_STORAGE", default=False)
379+
370380
# Third-party apps
371381

372382
CRISPY_TEMPLATE_PACK = "bootstrap3"

scancodeio/urls.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

2323
from django.conf import settings
24+
from django.conf.urls.static import static
2425
from django.contrib.auth import views as auth_views
2526
from django.urls import include
2627
from django.urls import path
@@ -54,6 +55,8 @@
5455
path("", RedirectView.as_view(url="project/")),
5556
]
5657

58+
urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
59+
5760

5861
if settings.SCANCODEIO_ENABLE_ADMIN_SITE:
5962
urlpatterns.append(path("admin/", admin_site.urls))

scanpipe/forms.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ class Meta:
164164
"pipeline",
165165
"execute_now",
166166
"selected_groups",
167+
"use_local_storage",
167168
]
168169

169170
def __init__(self, *args, **kwargs):
@@ -177,6 +178,11 @@ def __init__(self, *args, **kwargs):
177178
pipeline_choices = scanpipe_app.get_pipeline_choices(include_addon=False)
178179
self.fields["pipeline"].choices = pipeline_choices
179180

181+
self.fields["use_local_storage"].label = "Store packages locally"
182+
self.fields["use_local_storage"].help_text = "If checked, " \
183+
"packages will be stored on the local filesystem."
184+
self.fields["use_local_storage"].widget.attrs.update({"class": "checkbox"})
185+
180186
def clean_name(self):
181187
return " ".join(self.cleaned_data["name"].split())
182188

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Generated by Django 5.1.1 on 2025-05-10 06:55
2+
3+
import django.db.models.deletion
4+
import uuid
5+
from django.db import migrations, models
6+
7+
8+
class Migration(migrations.Migration):
    """Create the ``PackageArchive`` and ``DownloadedPackage`` tables."""

    dependencies = [
        ("scanpipe", "0067_discoveredpackage_notes"),
    ]

    operations = [
        migrations.CreateModel(
            name="PackageArchive",
            fields=[
                (
                    "uuid",
                    models.UUIDField(
                        db_index=True,
                        default=uuid.uuid4,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        verbose_name="UUID",
                    ),
                ),
                (
                    "checksum_sha256",
                    models.CharField(
                        db_index=True,
                        help_text="SHA256 checksum of the package archive file.",
                        max_length=64,
                        unique=True,
                    ),
                ),
                (
                    "storage_path",
                    models.CharField(
                        help_text="Path to the stored archive file (e.g., file:///path/to/file).",
                        max_length=1024,
                    ),
                ),
                (
                    "created_date",
                    models.DateTimeField(
                        auto_now_add=True,
                        help_text="Date when the archive was added to storage.",
                    ),
                ),
            ],
            options={
                "indexes": [
                    models.Index(fields=["checksum_sha256"], name="checksum_idx"),
                ],
            },
        ),
        migrations.CreateModel(
            name="DownloadedPackage",
            fields=[
                (
                    "uuid",
                    models.UUIDField(
                        db_index=True,
                        default=uuid.uuid4,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        verbose_name="UUID",
                    ),
                ),
                (
                    "url",
                    models.URLField(
                        blank=True,
                        db_index=True,
                        help_text="URL from which the package was downloaded, if applicable.",
                        max_length=1024,
                    ),
                ),
                (
                    "filename",
                    models.CharField(
                        help_text="Name of the package file.",
                        max_length=255,
                    ),
                ),
                (
                    "download_date",
                    models.DateTimeField(
                        auto_now_add=True,
                        help_text="Date when the package was downloaded or added.",
                    ),
                ),
                (
                    "scan_log",
                    models.TextField(
                        blank=True,
                        help_text="Log output from scanning the package.",
                    ),
                ),
                (
                    "scan_date",
                    models.DateTimeField(
                        blank=True,
                        help_text="Date when the package was scanned.",
                        null=True,
                    ),
                ),
                (
                    "project",
                    models.ForeignKey(
                        editable=False,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="downloadedpackages",
                        to="scanpipe.project",
                    ),
                ),
                (
                    "package_archive",
                    models.ForeignKey(
                        help_text="The stored archive file associated with this package.",
                        on_delete=django.db.models.deletion.CASCADE,
                        to="scanpipe.packagearchive",
                    ),
                ),
            ],
            options={
                "indexes": [
                    models.Index(fields=["url"], name="url_idx"),
                ],
                # Uniqueness of (url, project) only applies to non-empty URLs.
                "constraints": [
                    models.UniqueConstraint(
                        condition=models.Q(("url__gt", "")),
                        fields=("url", "project"),
                        name="scanpipe_downloadedpackage_unique_url_project",
                    ),
                ],
            },
        ),
    ]
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Generated by Django 5.1.1 on 2025-05-12 09:41
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add ``PackageArchive.package_file`` and allow a blank ``storage_path``."""

    dependencies = [
        ("scanpipe", "0068_packagearchive_downloadedpackage"),
    ]

    operations = [
        migrations.AddField(
            model_name="packagearchive",
            name="package_file",
            field=models.FileField(
                blank=True,
                help_text="The actual package archive file (e.g., ZIP or TAR).",
                null=True,
                upload_to="packages/",
            ),
        ),
        migrations.AlterField(
            model_name="packagearchive",
            name="storage_path",
            field=models.CharField(
                blank=True,
                help_text="Path to the stored archive file (e.g., file:///path/to/file).",
                max_length=1024,
            ),
        ),
    ]
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Generated by Django 5.1.1 on 2025-05-26 09:19
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """
    Add ``Project.use_local_storage`` and refresh the help texts of the
    ``PackageArchive`` ``package_file`` and ``storage_path`` fields.
    """

    dependencies = [
        ("scanpipe", "0069_packagearchive_package_file_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="project",
            name="use_local_storage",
            field=models.BooleanField(
                default=False,
                help_text="Store packages locally if enabled.",
            ),
        ),
        migrations.AlterField(
            model_name="packagearchive",
            name="package_file",
            field=models.FileField(
                blank=True,
                help_text="The actual package archive file ( ZIP or TAR).",
                null=True,
                upload_to="packages/",
            ),
        ),
        migrations.AlterField(
            model_name="packagearchive",
            name="storage_path",
            field=models.CharField(
                blank=True,
                help_text="Path to the stored archive file",
                max_length=1024,
            ),
        ),
    ]

scanpipe/models.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,7 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model):
585585
)
586586
notes = models.TextField(blank=True)
587587
settings = models.JSONField(default=dict, blank=True)
588+
<<<<<<< HEAD
588589
labels = TaggableManager(through=UUIDTaggedItem, ordering=["name"])
589590
purl = models.CharField(
590591
max_length=2048,
@@ -597,6 +598,11 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model):
597598
),
598599
)
599600

601+
=======
602+
labels = TaggableManager(through=UUIDTaggedItem)
603+
use_local_storage = models.BooleanField(default=False,
604+
help_text="Store packages locally if enabled.")
605+
>>>>>>> 0cade5ed (add support to store packages/archives locally)
600606
objects = ProjectQuerySet.as_manager()
601607

602608
class Meta:
@@ -4386,6 +4392,102 @@ def success(self):
43864392
return self.response_status_code in (200, 201, 202)
43874393

43884394

4395+
class PackageArchive(UUIDPKModel):
    """
    Store metadata about a package archive file kept in storage.

    Each archive is uniquely identified by its SHA256 checksum
    (``checksum_sha256`` is declared ``unique``). The archive itself is
    referenced through ``storage_path`` and/or ``package_file``; both are
    optional.
    """

    checksum_sha256 = models.CharField(
        max_length=64,
        unique=True,
        db_index=True,
        help_text=_("SHA256 checksum of the package archive file."),
    )
    storage_path = models.CharField(
        max_length=1024,
        blank=True,
        help_text=_("Path to the stored archive file"),
    )
    package_file = models.FileField(
        upload_to="packages/",
        null=True,
        blank=True,
        help_text=_("The actual package archive file ( ZIP or TAR)."),
    )
    created_date = models.DateTimeField(
        auto_now_add=True,
        help_text=_("Date when the archive was added to storage."),
    )

    class Meta:
        # NOTE(review): ``unique=True`` already implies an index, so both
        # ``db_index=True`` above and this explicit index look redundant.
        # Left untouched here because removing them needs a new migration.
        indexes = [
            models.Index(fields=["checksum_sha256"], name="checksum_idx"),
        ]

    def __str__(self):
        # Resolve the location outside the f-string: a replacement field that
        # spans several lines inside a single-quoted f-string is a SyntaxError
        # on Python versions before 3.12.
        location = self.storage_path or self.package_file.name
        return f"Archive {self.checksum_sha256[:8]} at {location}"
4431+
4432+
4433+
class DownloadedPackage(UUIDPKModel):
    """
    Record of a package downloaded, or supplied as input, for a project.

    Every row belongs to one ``Project`` and points at the ``PackageArchive``
    holding the file. It also keeps the source URL (blank when there is none),
    the file name, and the log and date of the scan.
    """

    project = models.ForeignKey(
        Project,
        on_delete=models.CASCADE,
        related_name="downloadedpackages",
        editable=False,
    )
    url = models.URLField(
        help_text=_("URL from which the package was downloaded, if applicable."),
        max_length=1024,
        blank=True,
        db_index=True,
    )
    filename = models.CharField(
        help_text=_("Name of the package file."),
        max_length=255,
    )
    download_date = models.DateTimeField(
        help_text=_("Date when the package was downloaded or added."),
        auto_now_add=True,
    )
    scan_log = models.TextField(
        help_text=_("Log output from scanning the package."),
        blank=True,
    )
    scan_date = models.DateTimeField(
        help_text=_("Date when the package was scanned."),
        blank=True,
        null=True,
    )
    package_archive = models.ForeignKey(
        PackageArchive,
        help_text=_("The stored archive file associated with this package."),
        on_delete=models.CASCADE,
    )

    class Meta:
        # A given non-empty URL may appear only once per project; rows with a
        # blank URL are excluded from the constraint by its condition.
        constraints = [
            models.UniqueConstraint(
                name="%(app_label)s_%(class)s_unique_url_project",
                fields=["url", "project"],
                condition=Q(url__gt=""),
            ),
        ]
        indexes = [
            models.Index(name="url_idx", fields=["url"]),
        ]

    def __str__(self):
        return "{} for project {}".format(self.filename, self.project.name)
4489+
4490+
43894491
@receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL)
43904492
def create_auth_token(sender, instance=None, created=False, **kwargs):
43914493
"""Create an API key token on user creation, using the signal system."""

scanpipe/pipelines/analyze_docker.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323
# ``logging`` and ``Path`` are used by this module (the module-level logger and
# ``store_package_archives``) and must be imported for it to load.
import logging
from pathlib import Path

from scanpipe.pipelines.analyze_root_filesystem import RootFS
from scanpipe.pipes import docker
from scanpipe.pipes import rootfs
from scanpipe.pipes.fetch import store_package_archive

logger = logging.getLogger(__name__)
2629

2730

2831
class Docker(RootFS):
@@ -36,6 +39,7 @@ def steps(cls):
3639
cls.find_images_os_and_distro,
3740
cls.collect_images_information,
3841
cls.collect_and_create_codebase_resources,
42+
cls.store_package_archives,
3943
cls.collect_and_create_system_packages,
4044
cls.flag_uninteresting_codebase_resources,
4145
cls.flag_empty_files,
@@ -74,6 +78,38 @@ def collect_and_create_codebase_resources(self):
7478
"""Collect and labels all image files as CodebaseResources."""
7579
for image in self.images:
7680
docker.create_codebase_resources(self.project, image)
81+
self.package_files = []
82+
for resource in self.project.codebaseresources.filter(extension=".deb"):
83+
self.package_files.append(resource.path)
84+
logger.debug(f"Found package file: {resource.path}")
85+
86+
def store_package_archives(self):
    """
    Store the package archives listed in ``self.package_files``.

    Return an empty list without doing anything when the project's
    ``use_local_storage`` flag is off. Otherwise pass every existing path to
    ``store_package_archive``. Missing paths and failed stores are logged and
    skipped, so one bad archive does not abort the others.

    Return the list of values returned by ``store_package_archive``.
    """
    project = self.project
    if not project.use_local_storage:
        # The trailing space keeps the two sentences apart once the adjacent
        # string literals are concatenated.
        logger.info(
            f"Local storage is disabled for project: {project.name}. "
            "Skipping package storage."
        )
        return []

    # Both fragments need the ``f`` prefix, otherwise the file list is logged
    # as the literal text "{self.package_files}".
    logger.info(
        f"Storing package archives for project: {project.name}, "
        f"files: {self.package_files}"
    )

    stored_files = []
    for package_path in self.package_files:
        # NOTE(review): these entries come from ``CodebaseResource.path``;
        # presumably that is relative to the codebase directory, in which case
        # this existence check resolves against the process CWD. Confirm
        # whether the resource's absolute location should be used instead.
        if not Path(package_path).exists():
            logger.error(f"Invalid or missing package path: {package_path}")
            continue

        package_path_str = str(package_path)
        logger.info(f"Storing package archive: {package_path_str}")
        try:
            result = store_package_archive(
                project, url=None, file_path=package_path_str
            )
        except Exception as e:
            # Deliberate best-effort; ``logger.exception`` keeps the traceback
            # that ``logger.error`` would drop.
            logger.exception(f"Failed to store {package_path_str}: {e}")
        else:
            logger.info(f"Stored package archive {package_path_str}: {result}")
            stored_files.append(result)

    return stored_files
77113

78114
def collect_and_create_system_packages(self):
79115
"""Collect installed system packages for each layer based on the distro."""

scanpipe/pipelines/analyze_docker_windows.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
from scanpipe.pipes import docker
2525
from scanpipe.pipes import rootfs
2626
from scanpipe.pipes import windows
27+
from scanpipe.pipes.fetch import store_package_archive
2728

28-
29+
# ``logging`` and ``Path`` are used in this module but no import for them is
# visible in this part of the file. They are placed here because the head of
# the import block lies outside the visible hunk; move them into the
# standard-library import group when applying.
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


2930
class DockerWindows(Docker):
3031
"""Analyze Windows Docker images."""
3132

@@ -37,6 +38,7 @@ def steps(cls):
3738
cls.find_images_os_and_distro,
3839
cls.collect_images_information,
3940
cls.collect_and_create_codebase_resources,
41+
cls.store_package_archives,
4042
cls.collect_and_create_system_packages,
4143
cls.flag_known_software_packages,
4244
cls.flag_uninteresting_codebase_resources,
@@ -50,6 +52,39 @@ def steps(cls):
5052
cls.flag_not_analyzed_codebase_resources,
5153
)
5254

55+
def store_package_archives(self):
    """
    Store identified package archives for Windows images.

    Return an empty list without doing anything when the project's
    ``use_local_storage`` flag is off. Otherwise collect the paths of the
    project's codebase resources whose extension is ``.msi`` or ``.exe`` and
    pass every existing one to ``store_package_archive``. Missing paths and
    failed stores are logged and skipped.

    Return the list of values returned by ``store_package_archive``.
    """
    project = self.project
    if not project.use_local_storage:
        # The trailing space keeps the two sentences apart once the adjacent
        # string literals are concatenated.
        logger.info(
            f"Local storage is disabled for project: {project.name}. "
            "Skipping package storage."
        )
        return []

    logger.info(f"Storing package archives for project: {project.name}")

    installers = project.codebaseresources.filter(extension__in=[".msi", ".exe"])
    package_files = [resource.path for resource in installers]

    stored_files = []
    for package_path in package_files:
        # NOTE(review): ``resource.path`` is presumably relative to the
        # codebase directory, in which case this existence check resolves
        # against the process CWD. Confirm whether the resource's absolute
        # location should be used instead.
        if not Path(package_path).exists():
            logger.error(f"Invalid or missing package path: {package_path}")
            continue

        package_path_str = str(package_path)
        logger.info(f"Storing package archive: {package_path_str}")
        try:
            result = store_package_archive(
                project, url=None, file_path=package_path_str
            )
        except Exception as e:
            # Deliberate best-effort; ``logger.exception`` keeps the traceback
            # that ``logger.error`` would drop.
            logger.exception(f"Failed to store {package_path_str}: {e}")
        else:
            logger.info(f"Stored package archive {package_path_str}: {result}")
            stored_files.append(result)

    return stored_files
87+
5388
def flag_known_software_packages(self):
5489
"""Flag files from known software packages by checking common install paths."""
5590
windows.flag_known_software(self.project)

0 commit comments

Comments
 (0)