Skip to content

Commit e4900f1

Browse files
Update debian purl2meta to scan only archives
Refactor the code so that package metadata scan results are no longer saved as archives. Instead, the metadata is scanned to serve as base package data, and package records are then scanned and stored for the binary/source archives. Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent a5a7ba0 commit e4900f1

File tree

2 files changed

+45
-71
lines changed

2 files changed

+45
-71
lines changed

minecode/utils.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -238,28 +238,30 @@ def get_http_response(uri, timeout=10):
238238
return response
239239

240240

241-
def get_package_sha1(package):
241+
def get_package_sha1(package, field="repository_download_url"):
242242
"""
243243
Return the sha1 value for `package` by checking if the sha1 file exists for
244244
`package` on maven and returning the contents if it does.
245245
246246
If the sha1 is invalid, we download the package's JAR and calculate the sha1
247247
from that.
248248
"""
249-
download_url = package.repository_download_url
249+
download_url = getattr(package, field)
250250
sha1_download_url = f'{download_url}.sha1'
251251
response = requests.get(sha1_download_url)
252+
sha1 = None
252253
if response.ok:
253254
sha1_contents = response.text.strip().split()
254255
sha1 = sha1_contents[0]
255256
sha1 = validate_sha1(sha1)
256-
if not sha1:
257-
# Download JAR and calculate sha1 if we cannot get it from the repo
258-
response = requests.get(download_url)
259-
if response:
260-
sha1_hash = hashlib.new('sha1', response.content)
261-
sha1 = sha1_hash.hexdigest()
262-
return sha1
257+
258+
if not sha1:
259+
# Download JAR and calculate sha1 if we cannot get it from the repo
260+
response = requests.get(download_url)
261+
if response:
262+
sha1_hash = hashlib.new('sha1', response.content)
263+
sha1 = sha1_hash.hexdigest()
264+
return sha1
263265

264266

265267
def validate_sha1(sha1):

minecode/visitors/debian.py

Lines changed: 34 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@
5050
"""
5151

5252

53-
DEBIAN_BASE_URL = "http://deb.debian.org/debian/pool/main/"
53+
DEBIAN_BASE_URL = "https://deb.debian.org/debian/pool/main/"
5454
DEBIAN_METADATA_URL = "https://metadata.ftp-master.debian.org/changelogs/main/"
5555
# Other URLs and sources to consider
5656
# 'http://ftp.debian.org/debian/'
@@ -328,19 +328,20 @@ def process_request(purl_str, **kwargs):
328328
329329
Return an error string for errors that occur, or empty string if there is no error.
330330
"""
331-
source_package_url = kwargs.get("source_purl", None)
331+
source_purl = kwargs.get("source_purl", None)
332332
try:
333333
package_url = PackageURL.from_string(purl_str)
334-
334+
source_package_url = PackageURL.from_string(source_purl)
335+
335336
except ValueError as e:
336-
error = f'error occured when parsing {purl_str}: {e}'
337+
error = f'error occured when parsing purl: {purl_str} source_purl: {source_purl} : {e}'
337338
return error
338339

339340
has_version = bool(package_url.version)
340341
if has_version:
341342
error = map_debian_metadata_binary_and_source(
342343
package_url=package_url,
343-
source_package_url=source_package_url
344+
source_package_url=source_package_url,
344345
)
345346

346347
return error
@@ -358,11 +359,10 @@ def map_debian_package(debian_package, package_content):
358359
db_package = None
359360
error = ''
360361

361-
if package_content == PackageContentType.BINARY:
362-
purl = debian_package.package_url
362+
purl = debian_package.package_url
363+
if package_content == PackageContentType.BINARY:
363364
download_url = debian_package.binary_archive_url
364365
elif package_content == PackageContentType.SOURCE_ARCHIVE:
365-
purl = debian_package.source_package_url
366366
download_url = debian_package.source_archive_url
367367

368368
response = requests.get(download_url)
@@ -372,26 +372,33 @@ def map_debian_package(debian_package, package_content):
372372
logger.error(msg)
373373
return db_package, error
374374

375-
package = PackageData(
375+
purl_package = PackageData(
376376
type=purl.type,
377377
namespace=purl.namespace,
378378
name=purl.name,
379379
version=purl.version,
380380
qualifiers=purl.qualifiers,
381-
download_url=download_url,
382381
)
383382

383+
package, error_metadata = get_debian_package_metadata(debian_package)
384+
if error_metadata:
385+
error += error_metadata
386+
package.update_purl_fields(package_data=purl_package, replace=True)
387+
388+
# This will be used to download and scan the package
389+
package.download_url = download_url
390+
384391
# Set package_content value
385392
package.extra_data['package_content'] = package_content
386393

387394
# If sha1 exists for an archive, we know we can create the package
388395
# Use purl info as base and create packages for binary and source package
389-
sha1 = get_package_sha1(package)
396+
sha1 = get_package_sha1(package=package, field="download_url")
390397
if sha1:
391398
package.sha1 = sha1
392399
db_package, _, _, _ = merge_or_create_package(package, visit_level=50)
393400
else:
394-
msg = f'Failed to retrieve JAR: {purl.to_string()} from url: {download_url}'
401+
msg = f'Failed to retrieve package archive: {purl.to_string()} from url: {download_url}'
395402
error += msg + '\n'
396403
logger.error(msg)
397404

@@ -402,13 +409,9 @@ def map_debian_package(debian_package, package_content):
402409
return db_package, error
403410

404411

405-
def map_debian_package_metadata(debian_package, package_content):
412+
def get_debian_package_metadata(debian_package):
406413
"""
407414
"""
408-
from minecode.model_utils import add_package_to_scan_queue
409-
from minecode.model_utils import merge_or_create_package
410-
411-
db_package = None
412415
error = ''
413416

414417
metadata_url = debian_package.package_metadata_url
@@ -417,43 +420,25 @@ def map_debian_package_metadata(debian_package, package_content):
417420
msg = f'Package metadata not exist on debian: {metadata_url}'
418421
error += msg + '\n'
419422
logger.error(msg)
420-
return db_package, error
421-
423+
return None, error
424+
422425
metadata_content = response.text
423426
filename = metadata_url.split("/")[-1]
424427
file_name, _, extension = filename.rpartition(".")
425-
temp_metadata_file = get_temp_file(file_name=file_name, extension=extension)
426-
427-
package = DebianDscFileHandler.parse(location=temp_metadata_file)
428+
temp_metadata_file = get_temp_file(file_name=file_name, extension=extension)
429+
with open(temp_metadata_file, 'a') as metadata_file:
430+
metadata_file.write(metadata_content)
428431

432+
packages = DebianDscFileHandler.parse(location=temp_metadata_file)
433+
package = list(packages).pop()
429434
# In the case of looking up a maven package with qualifiers of
430435
# `classifiers=sources`, the purl of the package created from the pom does
431436
# not have the qualifiers, so we need to set them. Additionally, the download
432437
# url is not properly generated since it would be missing the sources bit
433438
# from the filename.
434439
package.qualifiers = debian_package.package_url.qualifiers
435440

436-
# Set package_content value
437-
package.extra_data['package_content'] = package_content
438-
439-
# If sha1 exists for a jar, we know we can create the package
440-
# Use pom info as base and create packages for binary and source package
441-
442-
# Check to see if binary is available
443-
sha1 = get_package_sha1(package)
444-
if sha1:
445-
package.sha1 = sha1
446-
db_package, _, _, _ = merge_or_create_package(package, visit_level=50)
447-
else:
448-
msg = f'Failed to retrieve JAR: {debian_package.package_url}'
449-
error += msg + '\n'
450-
logger.error(msg)
451-
452-
# Submit package for scanning
453-
if db_package:
454-
add_package_to_scan_queue(db_package)
455-
456-
return db_package, error
441+
return package, error
457442

458443

459444
def map_debian_metadata_binary_and_source(package_url, source_package_url):
@@ -463,6 +448,8 @@ def map_debian_metadata_binary_and_source(package_url, source_package_url):
463448
464449
Return an error string for errors that occur, or empty string if there is no error.
465450
"""
451+
error = ''
452+
466453
if "repository_url" in package_url.qualifiers:
467454
base_url = package_url.qualifiers["repository_url"]
468455
else:
@@ -480,14 +467,6 @@ def map_debian_metadata_binary_and_source(package_url, source_package_url):
480467
metadata_base_url=metadata_base_url,
481468
)
482469

483-
error = ''
484-
metadata_package, emsg = map_debian_package_metadata(
485-
debian_package,
486-
PackageContentType.METADATA,
487-
)
488-
if emsg:
489-
error += emsg
490-
491470
binary_package, emsg = map_debian_package(
492471
debian_package,
493472
PackageContentType.BINARY,
@@ -503,16 +482,9 @@ def map_debian_metadata_binary_and_source(package_url, source_package_url):
503482
if emsg:
504483
error += emsg
505484

506-
if metadata_package and binary_package:
507-
make_relationship(
508-
from_package=metadata_package,
509-
to_package=binary_package,
510-
relationship=PackageRelation.Relationship.BINARY_PACKAGE,
511-
)
512-
513-
if metadata_package and source_package:
485+
if binary_package and source_package:
514486
make_relationship(
515-
from_package=metadata_package,
487+
from_package=binary_package,
516488
to_package=source_package,
517489
relationship=PackageRelation.Relationship.SOURCE_PACKAGE,
518490
)
@@ -527,8 +499,8 @@ class DebianPackage:
527499
metadata_base_url = attr.ib(type=str)
528500
package_url = attr.ib(type=str)
529501
source_package_url = attr.ib(type=str)
530-
metadata_directory_url = attr.ib(type=str)
531-
archive_directory_url = attr.ib(type=str)
502+
metadata_directory_url = attr.ib(type=str, default=None)
503+
archive_directory_url = attr.ib(type=str, default=None)
532504

533505
def __attrs_post_init__(self, *args, **kwargs):
534506
self.set_debian_archive_directory()

0 commit comments

Comments
 (0)