Skip to content

Commit 35efe84

Browse files
committed
fix CI errors
Signed-off-by: Varsha U N <[email protected]>
1 parent 5476933 commit 35efe84

File tree

6 files changed: +218 / -175 lines changed

scancodeio/settings.py

Lines changed: 15 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -23,12 +23,13 @@
2323
import sys
2424
import tempfile
2525
from pathlib import Path
26-
from venv import logger
26+
import logging
2727

2828
import environ
2929

3030
from scanpipe.archiving import LocalFilesystemProvider
3131

32+
3233
PROJECT_DIR = environ.Path(__file__) - 1
3334
ROOT_DIR = PROJECT_DIR - 1
3435

@@ -376,9 +377,10 @@
376377

377378
CRISPY_TEMPLATE_PACK = "bootstrap3"
378379

379-
# Storing archives locally (Package Storage settings)
380-
381-
ENABLE_DOWNLOAD_ARCHIVING = env.bool("ENABLE_DOWNLOAD_ARCHIVING", default=False)
380+
# Centralized archive directory for all projects
381+
CENTRAL_ARCHIVE_PATH = env.str(
382+
"CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives"
383+
)
382384

383385
# localstorage configuration
384386
DOWNLOAD_ARCHIVING_PROVIDER = env.str(
@@ -393,15 +395,15 @@
393395
# Initialize the DownloadStore for local storage
394396

395397
download_store = None
396-
if ENABLE_DOWNLOAD_ARCHIVING:
397-
if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage":
398-
config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
399-
root_path = Path(config.get("root_path", "/var/scancodeio/downloads"))
400-
try:
401-
download_store = LocalFilesystemProvider(root_path=root_path)
402-
except Exception as e:
403-
logger.error(f"Failed to initialize LocalFilesystemProvider: {e}")
404-
else:
398+
logger = logging.getLogger(__name__)
399+
if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage":
400+
config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
401+
root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH))
402+
try:
403+
download_store = LocalFilesystemProvider(root_path=root_path)
404+
except Exception as e:
405+
logger.error(f"Failed to initialize LocalFilesystemProvider: {e}")
406+
else:
405407
logger.error(
406408
f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}"
407409
)

scanpipe/archiving.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@ def _build_metadata(
6363
"sha256": sha256,
6464
"filename": filename,
6565
"download_date": download_date,
66-
"url": download_url,
66+
"download_url": download_url,
6767
}
6868

6969
@abstractmethod

scanpipe/pipelines/__init__.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
import inspect
2424
import logging
2525
import traceback
26+
import hashlib
2627
from contextlib import contextmanager
2728
from datetime import datetime
2829
from functools import wraps
@@ -34,7 +35,6 @@
3435
from pyinstrument import Profiler
3536

3637
from aboutcode.pipeline import BasePipeline
37-
from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING
3838
from scancodeio.settings import download_store
3939

4040
logger = logging.getLogger(__name__)
@@ -148,9 +148,24 @@ def download_missing_inputs(self):
148148
error_tracebacks.append((msg, "No traceback available."))
149149
continue
150150

151+
download_url = input_source.download_url
152+
if not download_url:
153+
continue
154+
155+
url_hash = hashlib.sha256(download_url.encode()).hexdigest()
156+
filename = input_source.filename or Path(download_url).name or f"{url_hash}.archive"
157+
archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
158+
159+
if archive_path.exists():
160+
logger.info(f"Reusing existing archive at {archive_path}")
161+
input_source.file_path = str(archive_path)
162+
input_source.save()
163+
continue
164+
151165
self.log(f"Fetching input from {input_source.download_url}")
152166
try:
153167
input_source.fetch()
168+
154169
except Exception as error:
155170
traceback_str = traceback.format_exc()
156171
logger.error(traceback_str)

scanpipe/pipes/input.py

Lines changed: 59 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,6 @@
4444
from scanpipe.models import InputSource
4545
from scanpipe.pipes import scancode
4646
from scanpipe.pipes.output import mappings_key_by_fieldname
47-
from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING
4847
from scancodeio.settings import download_store
4948

5049
logger = logging.getLogger(__name__)
@@ -262,61 +261,47 @@ def add_input_from_url(project, url, filename=None):
262261
logger.error(f"Failed to download {url}: {e}")
263262
raise
264263

265-
should_archive = (
266-
ENABLE_DOWNLOAD_ARCHIVING == "always"
267-
or (
268-
ENABLE_DOWNLOAD_ARCHIVING == "per_project"
269-
and getattr(project, "archive_downloads", False)
270-
)
271-
or (
272-
ENABLE_DOWNLOAD_ARCHIVING == "per_input"
273-
and "archive" in getattr(project, "input_tags", [])
274-
)
275-
)
264+
filename = filename or url.split("/")[-1] or "downloaded_file"
265+
url_hash = hashlib.sha256(url.encode()).hexdigest()
266+
archive_path = Path(project.settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
276267

277-
filename = filename or url.split("/")[-1]
278-
if should_archive and download_store:
279-
sha256 = hashlib.sha256(content).hexdigest()
280-
existing_download = download_store.get(sha256)
281-
if not existing_download:
282-
try:
283-
download = download_store.put(
284-
content=content,
285-
download_url=url,
286-
download_date=datetime.now().isoformat(),
287-
filename=filename,
288-
)
289-
except Exception as e:
290-
logger.error(f"Failed to archive download for {url}: {e}")
291-
raise
292-
else:
293-
download = existing_download
294-
295-
InputSource.objects.create(
296-
project=project,
297-
sha256=download.sha256,
298-
download_url=download.download_url,
299-
filename=download.filename,
300-
download_date=download.download_date,
301-
is_uploaded=False,
302-
)
268+
if download_store:
269+
try:
270+
download = download_store.put(
271+
content=content,
272+
download_url=url,
273+
download_date=datetime.now().isoformat(),
274+
filename=filename,
275+
)
276+
InputSource.objects.create(
277+
project=project,
278+
sha256=download.sha256,
279+
download_url=download.download_url,
280+
filename=download.filename,
281+
download_date=download.download_date,
282+
file_path=str(download.path),
283+
is_uploaded=False,
284+
)
285+
except Exception as e:
286+
logger.error(f"Failed to archive download for {url}: {e}")
287+
raise
303288
else:
304289
input_path = project.input_path / filename
305290
try:
291+
input_path.parent.mkdir(parents=True, exist_ok=True)
306292
with open(input_path, "wb") as f:
307293
f.write(content)
294+
InputSource.objects.create(
295+
project=project,
296+
filename=filename,
297+
download_url=url,
298+
file_path=str(input_path),
299+
is_uploaded=False,
300+
)
308301
except Exception as e:
309302
logger.error(f"Failed to save {filename} to {input_path}: {e}")
310303
raise
311304

312-
InputSource.objects.create(
313-
project=project,
314-
filename=filename,
315-
download_url=url,
316-
is_uploaded=False,
317-
)
318-
319-
320305
def add_input_from_upload(project, uploaded_file):
321306
"""
322307
Add an uploaded file as an InputSource for the specified ``project``.
@@ -325,54 +310,38 @@ def add_input_from_upload(project, uploaded_file):
325310
content = uploaded_file.read()
326311
filename = uploaded_file.name
327312

328-
should_archive = (
329-
ENABLE_DOWNLOAD_ARCHIVING == "always"
330-
or (
331-
ENABLE_DOWNLOAD_ARCHIVING == "per_project"
332-
and getattr(project, "archive_downloads", False)
333-
)
334-
or (
335-
ENABLE_DOWNLOAD_ARCHIVING == "per_input"
336-
and "archive" in getattr(project, "input_tags", [])
337-
)
338-
)
339-
340-
if should_archive and download_store:
341-
sha256 = hashlib.sha256(content).hexdigest()
342-
existing_download = download_store.get(sha256)
343-
if not existing_download:
344-
try:
345-
download = download_store.put(
346-
content=content,
347-
download_url="", # No URL for uploads
348-
download_date=datetime.now().isoformat(),
349-
filename=filename,
350-
)
351-
except Exception as e:
352-
logger.error(f"Failed to archive upload {filename}: {e}")
353-
raise
354-
else:
355-
download = existing_download
356-
357-
InputSource.objects.create(
358-
project=project,
359-
sha256=download.sha256,
360-
download_url=download.download_url,
361-
filename=download.filename,
362-
download_date=download.download_date,
363-
is_uploaded=True,
364-
)
313+
if download_store:
314+
try:
315+
download = download_store.put(
316+
content=content,
317+
download_url="",
318+
download_date=datetime.now().isoformat(),
319+
filename=filename,
320+
)
321+
InputSource.objects.create(
322+
project=project,
323+
sha256=download.sha256,
324+
download_url=download.download_url,
325+
filename=download.filename,
326+
download_date=download.download_date,
327+
file_path=str(download.path),
328+
is_uploaded=True,
329+
)
330+
except Exception as e:
331+
logger.error(f"Failed to archive upload {filename}: {e}")
332+
raise
365333
else:
366334
input_path = project.input_path / filename
367335
try:
336+
input_path.parent.mkdir(parents=True, exist_ok=True)
368337
with open(input_path, "wb") as f:
369338
f.write(content)
339+
InputSource.objects.create(
340+
project=project,
341+
filename=filename,
342+
file_path=str(input_path),
343+
is_uploaded=True,
344+
)
370345
except Exception as e:
371346
logger.error(f"Failed to save {filename} to {input_path}: {e}")
372-
raise
373-
374-
InputSource.objects.create(
375-
project=project,
376-
filename=filename,
377-
is_uploaded=True,
378-
)
347+
raise

0 commit comments

Comments
 (0)