
Commit 5476933

modify the required imports
Signed-off-by: Varsha U N <[email protected]>
1 parent fa5eb25 commit 5476933

File tree

    Dockerfile
    scancodeio/settings.py
    scanpipe/archiving.py
    scanpipe/pipelines/__init__.py
    scanpipe/pipes/input.py
    scanpipe/tests/test_input.py

6 files changed: +9 −320 lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -91,4 +91,4 @@ COPY --chown=$APP_USER:$APP_USER pyproject.toml $APP_DIR/
 RUN pip install --no-cache-dir .
 
 # Copy the codebase and set the proper permissions for the APP_USER
-COPY --chown=$APP_USER:$APP_USER . $APP_DIR
+COPY --chown=$APP_USER:$APP_USER . $APP_DIR

scancodeio/settings.py

Lines changed: 3 additions & 40 deletions

@@ -28,8 +28,6 @@
 import environ
 
 from scanpipe.archiving import LocalFilesystemProvider
-from scanpipe.archiving import S3LikeProvider
-from scanpipe.archiving import SftpProvider
 
 PROJECT_DIR = environ.Path(__file__) - 1
 ROOT_DIR = PROJECT_DIR - 1
@@ -378,11 +376,11 @@
 
 CRISPY_TEMPLATE_PACK = "bootstrap3"
 
-# Storing archives locally or in S3 (Package Storage settings)
+# Storing archives locally (Package Storage settings)
 
 ENABLE_DOWNLOAD_ARCHIVING = env.bool("ENABLE_DOWNLOAD_ARCHIVING", default=False)
 
-# localstorage, s3, sftp
+# localstorage configuration
 DOWNLOAD_ARCHIVING_PROVIDER = env.str(
     "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage"
 )
@@ -392,7 +390,7 @@
     "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None
 )
 
-# Initialize the DownloadStore based on provider
+# Initialize the DownloadStore for local storage
 
 download_store = None
 if ENABLE_DOWNLOAD_ARCHIVING:
@@ -403,41 +401,6 @@
             download_store = LocalFilesystemProvider(root_path=root_path)
         except Exception as e:
             logger.error(f"Failed to initialize LocalFilesystemProvider: {e}")
-    elif DOWNLOAD_ARCHIVING_PROVIDER == "s3":
-        config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
-        required_keys = ["bucket_name", "aws_userid", "aws_apikey"]
-        if not all(key in config for key in required_keys):
-            logger.error(
-                f"S3 provider requires {required_keys}"
-                "in DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION"
-            )
-        else:
-            try:
-                download_store = S3LikeProvider(
-                    bucket_name=config.get("bucket_name"),
-                    aws_userid=config.get("aws_userid"),
-                    aws_apikey=config.get("aws_apikey"),
-                    other_aws_credentials=config.get("other_aws_credentials", {}),
-                )
-            except Exception as e:
-                logger.error(f"Failed to initialize S3LikeProvider: {e}")
-    elif DOWNLOAD_ARCHIVING_PROVIDER == "sftp":
-        config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
-        required_keys = ["host", "root_path", "ssh_credentials"]
-        if not all(key in config for key in required_keys):
-            logger.error(
-                f"SFTP provider requires {required_keys}"
-                "in DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION"
-            )
-        else:
-            try:
-                download_store = SftpProvider(
-                    host=config.get("host"),
-                    root_path=config.get("root_path"),
-                    ssh_credentials=config.get("ssh_credentials", {}),
-                )
-            except Exception as e:
-                logger.error(f"Failed to initialize SftpProvider: {e}")
     else:
         logger.error(
             f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}"

scanpipe/archiving.py

Lines changed: 0 additions & 274 deletions

@@ -31,10 +31,6 @@
 from dataclasses import dataclass
 from pathlib import Path
 
-import boto3
-import paramiko
-from botocore.exceptions import ClientError
-from paramiko.ssh_exception import SSHException
 
 logger = logging.getLogger(__name__)
 
@@ -192,273 +188,3 @@ def find(
         return None
 
 
-class S3LikeProvider(DownloadStore):
-    def __init__(
-        self,
-        bucket_name: str,
-        aws_userid: str,
-        aws_apikey: str,
-        other_aws_credentials: dict,
-    ):
-        self.bucket_name = bucket_name
-        self.s3_client = boto3.client(
-            "s3",
-            aws_access_key_id=aws_userid,
-            aws_secret_access_key=aws_apikey,
-            **(other_aws_credentials or {}),
-        )
-
-    def _get_content_path(self, sha256: str) -> str:
-        """S3 key like 59/4c/67/<sha256>/"""
-        return f"{sha256[:2]}/{sha256[2:4]}/{sha256[4:]}/"
-
-    def list(self):
-        """List all stored downloads."""
-        downloads = []
-        try:
-            paginator = self.s3_client.get_paginator("list_objects_v2")
-            for page in paginator.paginate(Bucket=self.bucket_name):
-                for obj in page.get("Contents", []):
-                    key = obj["Key"]
-                    if key.endswith(".json"):
-                        try:
-                            response = self.s3_client.get_object(
-                                Bucket=self.bucket_name, Key=key
-                            )
-                            data = json.loads(response["Body"].read())
-                            downloads.append(Download(**data))
-                        except Exception as e:
-                            logger.error(f"Error reading S3 object {key}: {e}")
-        except ClientError as e:
-            logger.error(f"Failed to list S3 objects: {e}")
-        return downloads
-
-    def get(self, sha256_checksum: str):
-        """Retrieve a Download object for the given SHA256 hash."""
-        prefix = self._get_content_path(sha256_checksum)
-        try:
-            response = self.s3_client.list_objects_v2(
-                Bucket=self.bucket_name, Prefix=prefix, MaxKeys=1
-            )
-            if "Contents" in response:
-                key = response["Contents"][0]["Key"]
-                obj_response = self.s3_client.get_object(
-                    Bucket=self.bucket_name, Key=key
-                )
-                data = json.loads(obj_response["Body"].read())
-                return Download(**data)
-        except ClientError as e:
-            logger.error(f"Failed to get S3 object for {sha256_checksum}: {e}")
-        return None
-
-    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
-        """Store the content and its metadata."""
-        sha256 = self._compute_sha256(content)
-        content_key = self._get_content_path(sha256) + "content"
-        try:
-            self.s3_client.head_object(Bucket=self.bucket_name, Key=content_key)
-            logger.info(f"Content already exists for {sha256}")
-        except ClientError:
-            try:
-                self.s3_client.put_object(
-                    Bucket=self.bucket_name,
-                    Key=content_key,
-                    Body=content,
-                )
-            except ClientError as e:
-                raise Exception(f"Failed to write content to S3 {content_key}: {e}")
-
-        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
-        origin_filename = f"origin-{origin_hash}.json"
-        origin_key = self._get_content_path(sha256) + origin_filename
-
-        metadata = self._build_metadata(sha256, filename, download_date, download_url)
-        metadata_json = json.dumps(metadata, indent=2).encode("utf-8")
-        try:
-            self.s3_client.put_object(
-                Bucket=self.bucket_name,
-                Key=origin_key,
-                Body=metadata_json,
-            )
-        except ClientError as e:
-            raise Exception(f"Failed to write metadata to S3 {origin_key}: {e}")
-
-        return Download(**metadata)
-
-    def find(
-        self, download_url: str = None, filename: str = None, download_date: str = None
-    ):
-        """Find a download based on metadata."""
-        if not (download_url or filename or download_date):
-            return None
-        try:
-            paginator = self.s3_client.get_paginator("list_objects_v2")
-            for page in paginator.paginate(Bucket=self.bucket_name):
-                for obj in page.get("Contents", []):
-                    key = obj["Key"]
-                    if key.endswith(".json"):
-                        try:
-                            response = self.s3_client.get_object(
-                                Bucket=self.bucket_name, Key=key
-                            )
-                            data = json.loads(response["Body"].read())
-                            if (
-                                (
-                                    download_url is None
-                                    or data.get("url") == download_url
-                                )
-                                and (
-                                    filename is None or data.get("filename") == filename
-                                )
-                                and (
-                                    download_date is None
-                                    or data.get("download_date") == download_date
-                                )
-                            ):
-                                return Download(**data)
-                        except Exception as e:
-                            logger.error(f"Error reading S3 object {key}: {e}")
-        except ClientError as e:
-            logger.error(f"Failed to find in S3: {e}")
-        return None
-
-
-class SftpProvider(DownloadStore):
-    def __init__(self, host: str, root_path: str, ssh_credentials: dict):
-        self.host = host
-        self.root_path = Path(root_path)
-        self.ssh_credentials = ssh_credentials
-        self.ssh = paramiko.SSHClient()
-        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-        try:
-            self.ssh.connect(
-                hostname=host,
-                username=ssh_credentials.get("username"),
-                password=ssh_credentials.get("password"),
-            )
-            self.sftp = self.ssh.open_sftp()
-        except SSHException as e:
-            raise Exception(f"Failed to connect to SFTP server {host}: {e}")
-
-    def _get_content_path(self, sha256: str) -> str:
-        """SFTP path like 59/4c/67/<sha256>/"""
-        return str(self.root_path / sha256[:2] / sha256[2:4] / sha256[4:])
-
-    def list(self):
-        """List all stored downloads."""
-        downloads = []
-        try:
-            for root, _, files in self._sftp_walk(self.root_path):
-                for filename in files:
-                    if filename.endswith(".json"):
-                        file_path = os.path.join(root, filename)
-                        try:
-                            with self.sftp.open(file_path, "r") as f:
-                                data = json.load(f)
-                                downloads.append(Download(**data))
-                        except Exception as e:
-                            logger.error(f"Error reading SFTP file {file_path}: {e}")
-        except SSHException as e:
-            logger.error(f"Failed to list SFTP files: {e}")
-        return downloads
-
-    def _sftp_walk(self, path):
-        """Recursively walk SFTP directory."""
-        path = str(path)
-        for entry in self.sftp.listdir_attr(path):
-            full_path = os.path.join(path, entry.filename)
-            if stat.S_ISDIR(entry.st_mode):
-                yield from self._sftp_walk(full_path)
-            else:
-                yield path, [], [entry.filename]
-
-    def get(self, sha256_checksum: str):
-        """Retrieve a Download object for the given SHA256 hash."""
-        content_path = self._get_content_path(sha256_checksum)
-        try:
-            files = self.sftp.listdir(content_path)
-            origin_files = [
-                f for f in files if f.startswith("origin-") and f.endswith(".json")
-            ]
-            if origin_files:
-                with self.sftp.open(
-                    os.path.join(content_path, origin_files[0]), "r"
-                ) as f:
-                    data = json.load(f)
-                    return Download(**data)
-        except SSHException as e:
-            logger.error(f"Failed to get SFTP file for {sha256_checksum}: {e}")
-        return None
-
-    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
-        """Store the content and its metadata."""
-        sha256 = self._compute_sha256(content)
-        content_path = self._get_content_path(sha256)
-        try:
-            self.sftp.mkdir(content_path)
-        except SSHException:
-            pass
-
-        content_file = os.path.join(content_path, "content")
-        try:
-            self.sftp.stat(content_file)
-            logger.info(f"Content already exists for {sha256}")
-        except SSHException:
-            try:
-                with self.sftp.open(content_file, "wb") as f:
-                    f.write(content)
-            except SSHException as e:
-                raise Exception(f"Failed to write content to SFTP {content_file}: {e}")
-
-        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
-        origin_filename = f"origin-{origin_hash}.json"
-        origin_path = os.path.join(content_path, origin_filename)
-        try:
-            self.sftp.stat(origin_path)
-            raise Exception(f"Origin {origin_filename} already exists")
-        except SSHException:
-            metadata = self._build_metadata(
-                sha256, filename, download_date, download_url
-            )
-            metadata_json = json.dumps(metadata, indent=2).encode("utf-8")
-            try:
-                with self.sftp.open(origin_path, "wb") as f:
-                    f.write(metadata_json)
-            except SSHException as e:
-                raise Exception(f"Failed to write metadata to SFTP {origin_path}: {e}")
-
-        return Download(**metadata)
-
-    def find(
-        self, download_url: str = None, filename: str = None, download_date: str = None
-    ):
-        """Find a download based on metadata."""
-        if not (download_url or filename or download_date):
-            return None
-        try:
-            for root, _, files in self._sftp_walk(self.root_path):
-                for filename in files:
-                    if filename.endswith(".json"):
-                        file_path = os.path.join(root, filename)
-                        try:
-                            with self.sftp.open(file_path, "r") as f:
-                                data = json.load(f)
-                                if (
-                                    (
-                                        download_url is None
-                                        or data.get("url") == download_url
-                                    )
-                                    and (
-                                        filename is None or data.get("filename") == filename
-                                    )
-                                    and (
-                                        download_date is None
-                                        or data.get("download_date") == download_date
-                                    )
-                                ):
-                                    return Download(**data)
-                        except Exception as e:
-                            logger.error(f"Error reading SFTP file {file_path}: {e}")
-        except SSHException as e:
-            logger.error(f"Failed to find in SFTP: {e}")
-        return None
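Both removed providers used the same content-addressed layout as the surviving local provider: content lives under a path derived from its SHA256, so identical bytes land at the same key and put() can treat an existing object as already stored (the head_object/stat checks above). A minimal sketch of that sharding scheme, mirroring the removed _get_content_path helpers; the standalone function name is hypothetical:

    import hashlib

    def content_path(content: bytes) -> str:
        """Shard by SHA256 prefix, e.g. "ab/cd/<remaining 60 hex chars>/"."""
        sha256 = hashlib.sha256(content).hexdigest()
        # Two 2-character directory levels keep any single directory from
        # accumulating an unbounded number of entries.
        return f"{sha256[:2]}/{sha256[2:4]}/{sha256[4:]}/"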

scanpipe/pipelines/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -34,8 +34,8 @@
 from pyinstrument import Profiler
 
 from aboutcode.pipeline import BasePipeline
-from scanpipe.settings import ENABLE_DOWNLOAD_ARCHIVING
-from scanpipe.settings import download_store
+from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING
+from scancodeio.settings import download_store
 
 logger = logging.getLogger(__name__)

scanpipe/pipes/input.py

Lines changed: 2 additions & 2 deletions

@@ -44,8 +44,8 @@
 from scanpipe.models import InputSource
 from scanpipe.pipes import scancode
 from scanpipe.pipes.output import mappings_key_by_fieldname
-from scanpipe.settings import ENABLE_DOWNLOAD_ARCHIVING
-from scanpipe.settings import download_store
+from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING
+from scancodeio.settings import download_store
 
 logger = logging.getLogger(__name__)

scanpipe/tests/test_input.py

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@
 from scanpipe.models import InputSource
 from scanpipe.pipes.input import add_input_from_upload
 from scanpipe.pipes.input import add_input_from_url
-from scanpipe.settings import download_store
+from scancodeio.settings import download_store
 from scanpipe.tests import make_project
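The last three files get the same one-line fix the commit message names: ENABLE_DOWNLOAD_ARCHIVING and download_store are defined in scancodeio/settings.py, so they must be imported from scancodeio.settings, not the nonexistent scanpipe.settings. A sketch of how a consumer would guard its archiving work after this change; the helper function is hypothetical, only the imports and the None-when-disabled behavior come from the diff:

    from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING
    from scancodeio.settings import download_store

    def archive_download(content: bytes, url: str, date: str, filename: str):
        # download_store stays None unless ENABLE_DOWNLOAD_ARCHIVING is set
        # and the provider initialized cleanly, so check both before use.
        if ENABLE_DOWNLOAD_ARCHIVING and download_store:
            return download_store.put(
                content=content,
                download_url=url,
                download_date=date,
                filename=filename,
            )
        return None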
