Skip to content

Commit 0cade5e

Browse files
committed
add support to store packages/archives locally
Signed-off-by: Varsha U N <[email protected]>
1 parent 53700eb commit 0cade5e

21 files changed

+942
-6
lines changed

scancodeio/settings.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23+
import os
2324
import sys
2425
import tempfile
2526
from pathlib import Path
@@ -348,6 +349,15 @@
348349
PROJECT_DIR("static"),
349350
]
350351

352+
# Media files: uploaded content such as package archives.
# MEDIA_URL is the URL prefix, MEDIA_ROOT the "media" directory under ROOT_DIR.

MEDIA_URL = "/media/"
MEDIA_ROOT = os.path.join(str(ROOT_DIR), "media")

# Package storage: feature flag read from the ENABLE_PACKAGE_STORAGE
# environment variable; off unless explicitly enabled.

ENABLE_PACKAGE_STORAGE = env.bool("ENABLE_PACKAGE_STORAGE", default=False)
360+
351361
# Third-party apps
352362

353363
CRISPY_TEMPLATE_PACK = "bootstrap3"

scancodeio/urls.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

2323
from django.conf import settings
24+
from django.conf.urls.static import static
2425
from django.contrib.auth import views as auth_views
2526
from django.urls import include
2627
from django.urls import path
@@ -54,6 +55,8 @@
5455
path("", RedirectView.as_view(url="project/")),
5556
]
5657

58+
# Expose files stored under MEDIA_ROOT at MEDIA_URL. Django's ``static()``
# helper returns an empty list unless DEBUG is enabled, so this route only
# exists in development setups.
urlpatterns.extend(
    static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
)
59+
5760

5861
if settings.SCANCODEIO_ENABLE_ADMIN_SITE:
5962
urlpatterns.append(path("admin/", admin_site.urls))

scanpipe/forms.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ class Meta:
160160
"pipeline",
161161
"execute_now",
162162
"selected_groups",
163+
"use_local_storage",
163164
]
164165

165166
def __init__(self, *args, **kwargs):
@@ -173,6 +174,11 @@ def __init__(self, *args, **kwargs):
173174
pipeline_choices = scanpipe_app.get_pipeline_choices(include_addon=False)
174175
self.fields["pipeline"].choices = pipeline_choices
175176

177+
self.fields["use_local_storage"].label = "Store packages locally"
178+
self.fields["use_local_storage"].help_text = "If checked, " \
179+
"packages will be stored on the local filesystem."
180+
self.fields["use_local_storage"].widget.attrs.update({"class": "checkbox"})
181+
176182
def clean_name(self):
177183
return " ".join(self.cleaned_data["name"].split())
178184

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Generated by Django 5.1.1 on 2025-05-10 06:55
2+
3+
import django.db.models.deletion
4+
import uuid
5+
from django.db import migrations, models
6+
7+
8+
class Migration(migrations.Migration):
    """Create the ``PackageArchive`` and ``DownloadedPackage`` models."""

    dependencies = [
        ("scanpipe", "0067_discoveredpackage_notes"),
    ]

    operations = [
        migrations.CreateModel(
            name="PackageArchive",
            fields=[
                (
                    "uuid",
                    models.UUIDField(
                        db_index=True,
                        default=uuid.uuid4,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        verbose_name="UUID",
                    ),
                ),
                (
                    "checksum_sha256",
                    models.CharField(
                        db_index=True,
                        help_text="SHA256 checksum of the package archive file.",
                        max_length=64,
                        unique=True,
                    ),
                ),
                (
                    "storage_path",
                    models.CharField(
                        help_text="Path to the stored archive file (e.g., file:///path/to/file).",
                        max_length=1024,
                    ),
                ),
                (
                    "created_date",
                    models.DateTimeField(
                        auto_now_add=True,
                        help_text="Date when the archive was added to storage.",
                    ),
                ),
            ],
            options={
                "indexes": [
                    models.Index(fields=["checksum_sha256"], name="checksum_idx"),
                ],
            },
        ),
        migrations.CreateModel(
            name="DownloadedPackage",
            fields=[
                (
                    "uuid",
                    models.UUIDField(
                        db_index=True,
                        default=uuid.uuid4,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        verbose_name="UUID",
                    ),
                ),
                (
                    "url",
                    models.URLField(
                        blank=True,
                        db_index=True,
                        help_text="URL from which the package was downloaded, if applicable.",
                        max_length=1024,
                    ),
                ),
                (
                    "filename",
                    models.CharField(
                        help_text="Name of the package file.",
                        max_length=255,
                    ),
                ),
                (
                    "download_date",
                    models.DateTimeField(
                        auto_now_add=True,
                        help_text="Date when the package was downloaded or added.",
                    ),
                ),
                (
                    "scan_log",
                    models.TextField(
                        blank=True,
                        help_text="Log output from scanning the package.",
                    ),
                ),
                (
                    "scan_date",
                    models.DateTimeField(
                        blank=True,
                        help_text="Date when the package was scanned.",
                        null=True,
                    ),
                ),
                (
                    "project",
                    models.ForeignKey(
                        editable=False,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="downloadedpackages",
                        to="scanpipe.project",
                    ),
                ),
                (
                    "package_archive",
                    models.ForeignKey(
                        help_text="The stored archive file associated with this package.",
                        on_delete=django.db.models.deletion.CASCADE,
                        to="scanpipe.packagearchive",
                    ),
                ),
            ],
            options={
                "indexes": [
                    models.Index(fields=["url"], name="url_idx"),
                ],
                "constraints": [
                    # Uniqueness of (url, project) is only enforced for
                    # non-empty URLs.
                    models.UniqueConstraint(
                        condition=models.Q(("url__gt", "")),
                        fields=("url", "project"),
                        name="scanpipe_downloadedpackage_unique_url_project",
                    ),
                ],
            },
        ),
    ]
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Generated by Django 5.1.1 on 2025-05-12 09:41
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add ``PackageArchive.package_file`` and allow a blank ``storage_path``."""

    dependencies = [
        ("scanpipe", "0068_packagearchive_downloadedpackage"),
    ]

    operations = [
        migrations.AddField(
            model_name="packagearchive",
            name="package_file",
            field=models.FileField(
                blank=True,
                help_text="The actual package archive file (e.g., ZIP or TAR).",
                null=True,
                upload_to="packages/",
            ),
        ),
        migrations.AlterField(
            model_name="packagearchive",
            name="storage_path",
            field=models.CharField(
                blank=True,
                help_text="Path to the stored archive file (e.g., file:///path/to/file).",
                max_length=1024,
            ),
        ),
    ]
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Generated by Django 5.1.1 on 2025-05-26 09:19
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """
    Add ``Project.use_local_storage`` and update the help texts of the
    ``PackageArchive`` ``package_file`` and ``storage_path`` fields.
    """

    dependencies = [
        ("scanpipe", "0069_packagearchive_package_file_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="project",
            name="use_local_storage",
            field=models.BooleanField(
                default=False,
                help_text="Store packages locally if enabled.",
            ),
        ),
        migrations.AlterField(
            model_name="packagearchive",
            name="package_file",
            field=models.FileField(
                blank=True,
                help_text="The actual package archive file ( ZIP or TAR).",
                null=True,
                upload_to="packages/",
            ),
        ),
        migrations.AlterField(
            model_name="packagearchive",
            name="storage_path",
            field=models.CharField(
                blank=True,
                help_text="Path to the stored archive file",
                max_length=1024,
            ),
        ),
    ]

scanpipe/models.py

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,8 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model):
561561
notes = models.TextField(blank=True)
562562
settings = models.JSONField(default=dict, blank=True)
563563
labels = TaggableManager(through=UUIDTaggedItem)
564-
564+
use_local_storage = models.BooleanField(default=False,
565+
help_text="Store packages locally if enabled.")
565566
objects = ProjectQuerySet.as_manager()
566567

567568
class Meta:
@@ -4134,6 +4135,102 @@ def success(self):
41344135
return self.response_status_code in (200, 201, 202)
41354136

41364137

4138+
class PackageArchive(UUIDPKModel):
    """
    Store metadata about a package archive file kept in storage.

    Each archive is uniquely identified by its SHA256 checksum. The archive
    location is recorded either as a ``storage_path`` string or as an uploaded
    ``package_file``; both are optional.
    """

    # NOTE(review): ``unique=True`` already implies an index, so ``db_index``
    # and the explicit ``checksum_idx`` in Meta look redundant. They are kept
    # as-is here because removing them requires a new migration.
    checksum_sha256 = models.CharField(
        max_length=64,
        unique=True,
        db_index=True,
        help_text=_("SHA256 checksum of the package archive file."),
    )
    storage_path = models.CharField(
        max_length=1024,
        blank=True,
        help_text=_("Path to the stored archive file"),
    )
    package_file = models.FileField(
        upload_to="packages/",
        null=True,
        blank=True,
        help_text=_("The actual package archive file ( ZIP or TAR)."),
    )
    created_date = models.DateTimeField(
        auto_now_add=True,
        help_text=_("Date when the archive was added to storage."),
    )

    class Meta:
        indexes = [
            models.Index(fields=["checksum_sha256"], name="checksum_idx"),
        ]

    def __str__(self):
        """Return a short label: checksum prefix and where the archive lives."""
        # Resolve the location outside the f-string: a replacement field that
        # spans several lines inside a single-quoted f-string is a SyntaxError
        # on Python versions before 3.12.
        location = self.storage_path or self.package_file.name
        return f"Archive {self.checksum_sha256[:8]} at {location}"
4174+
4175+
4176+
class DownloadedPackage(UUIDPKModel):
    """
    Record a package downloaded or provided as input for a project.

    Every instance belongs to one project and points to the ``PackageArchive``
    holding the stored file. It also keeps the source URL (when the package
    was downloaded) and the scan log and date.
    """

    # Deleting the project deletes its downloaded packages.
    project = models.ForeignKey(
        Project,
        related_name="downloadedpackages",
        on_delete=models.CASCADE,
        editable=False,
    )
    # Left blank when the package was not fetched from a URL.
    url = models.URLField(
        max_length=1024,
        db_index=True,
        blank=True,
        help_text=_("URL from which the package was downloaded, if applicable."),
    )
    filename = models.CharField(
        max_length=255,
        help_text=_("Name of the package file."),
    )
    download_date = models.DateTimeField(
        auto_now_add=True,
        help_text=_("Date when the package was downloaded or added."),
    )
    scan_log = models.TextField(
        blank=True,
        help_text=_("Log output from scanning the package."),
    )
    scan_date = models.DateTimeField(
        null=True,
        blank=True,
        help_text=_("Date when the package was scanned."),
    )
    # Deleting the archive deletes the packages that reference it.
    package_archive = models.ForeignKey(
        PackageArchive,
        on_delete=models.CASCADE,
        help_text=_("The stored archive file associated with this package."),
    )

    class Meta:
        indexes = [
            models.Index(fields=["url"], name="url_idx"),
        ]
        constraints = [
            # A given non-empty URL may appear only once per project; rows
            # with an empty URL are exempt from the constraint.
            models.UniqueConstraint(
                fields=["url", "project"],
                condition=Q(url__gt=""),
                name="%(app_label)s_%(class)s_unique_url_project",
            ),
        ]

    def __str__(self):
        """Return the file name together with the owning project's name."""
        filename = self.filename
        project_name = self.project.name
        return f"{filename} for project {project_name}"
4232+
4233+
41374234
@receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL)
41384235
def create_auth_token(sender, instance=None, created=False, **kwargs):
41394236
"""Create an API key token on user creation, using the signal system."""

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23+
import logging
24+
from pathlib import Path
25+
2326
from aboutcode.pipeline import group
2427
from scanpipe import pipes
2528
from scanpipe.pipelines import Pipeline
@@ -28,7 +31,9 @@
2831
from scanpipe.pipes import matchcode
2932
from scanpipe.pipes import purldb
3033
from scanpipe.pipes import scancode
34+
from scanpipe.pipes.fetch import store_package_archive
3135

36+
logger = logging.getLogger(__name__)
3237

3338
class DeployToDevelop(Pipeline):
3439
"""
@@ -59,6 +64,7 @@ def steps(cls):
5964
cls.extract_inputs_to_codebase_directory,
6065
cls.extract_archives,
6166
cls.collect_and_create_codebase_resources,
67+
cls.store_package_archives,
6268
cls.fingerprint_codebase_directories,
6369
cls.flag_empty_files,
6470
cls.flag_whitespace_files,
@@ -116,6 +122,40 @@ def steps(cls):
116122
".odp",
117123
]
118124

125+
def store_package_archives(self):
    """
    Store the project's package archives locally when local storage is enabled.

    Codebase resources whose extension is listed in
    ``purldb_package_extensions`` are passed one by one to
    ``store_package_archive``.

    Return the list of values returned by ``store_package_archive`` for the
    archives that were stored, or an empty list when the project has local
    storage disabled. A failure on one archive is logged and does not stop
    the remaining archives from being processed.
    """
    project = self.project

    if not project.use_local_storage:
        # The two literals are concatenated: the trailing space after the
        # period keeps the sentences apart in the emitted message.
        logger.info(
            f"Local storage is disabled for project: {project.name}. "
            "Skipping package storage."
        )
        return []

    logger.info(f"Storing package archives for project: {project.name}")
    package_files = [
        resource.path
        for resource in project.codebaseresources.filter(
            extension__in=self.purldb_package_extensions
        )
    ]

    stored_files = []
    for package_path in package_files:
        # NOTE(review): ``resource.path`` is checked as-is. If it is stored
        # relative to the project codebase directory, this test resolves
        # against the current working directory -- confirm.
        if not Path(package_path).exists():
            logger.error(f"Invalid or missing package path: {package_path}")
            continue

        package_path_str = str(package_path)
        logger.info(f"Storing package archive: {package_path_str}")
        try:
            result = store_package_archive(
                project, url=None, file_path=package_path_str
            )
        except Exception as e:
            # Best-effort step: log with the traceback and move on to the
            # next archive rather than failing the whole pipeline.
            logger.exception(f"Failed to store {package_path_str}: {e}")
            continue

        logger.info(f"Stored package archive {package_path_str}: {result}")
        stored_files.append(result)

    return stored_files
158+
119159
def get_inputs(self):
120160
"""Locate the ``from`` and ``to`` input files."""
121161
self.from_files, self.to_files = d2d.get_inputs(self.project)

0 commit comments

Comments
 (0)