Skip to content

Commit 212738a

Browse files
committed
Refine update_from_purldb merging data when multiple PURLDB entries #303
Signed-off-by: tdruez <[email protected]>
1 parent 93ab123 commit 212738a

File tree

4 files changed

+97
-5
lines changed

4 files changed

+97
-5
lines changed

component_catalog/models.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
from dje.models import ReferenceNotesMixin
7474
from dje.tasks import logger as tasks_logger
7575
from dje.utils import is_purl_str
76+
from dje.utils import merge_common_non_empty_values
7677
from dje.utils import set_fields_from_object
7778
from dje.validators import generic_uri_validator
7879
from dje.validators import validate_url_segment
@@ -2482,14 +2483,23 @@ def get_purldb_entries(self, user, max_request_call=0, timeout=10):
24822483

24832484
def update_from_purldb(self, user):
24842485
"""
2485-
Find this Package in the PurlDB and update empty fields with PurlDB data
2486-
when available.
2486+
Update this Package instance with data from PurlDB.
2487+
2488+
- Retrieves matching entries from PurlDB using the given user.
2489+
- If exactly one match is found, its data is used directly.
2490+
- If multiple entries are found, only values that are non-empty and
2491+
common across all entries are merged and used to update the Package.
24872492
"""
24882493
purldb_entries = self.get_purldb_entries(user)
24892494
if not purldb_entries:
24902495
return
24912496

2492-
package_data = purldb_entries[0]
2497+
purldb_entries_count = len(purldb_entries)
2498+
if purldb_entries_count == 1:
2499+
package_data = purldb_entries[0]
2500+
else:
2501+
package_data = merge_common_non_empty_values(purldb_entries)
2502+
24932503
# The format from PURLDB is "2019-11-18T00:00:00Z"
24942504
if release_date := package_data.get("release_date"):
24952505
package_data["release_date"] = release_date.split("T")[0]

component_catalog/tests/test_models.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2598,9 +2598,9 @@ def test_package_model_update_from_purldb(self, mock_get_purldb_entries):
25982598
}
25992599

26002600
mock_get_purldb_entries.return_value = [purldb_entry]
2601-
package1 = Package.objects.create(
2601+
package1 = make_package(
2602+
self.dataspace,
26022603
filename="package",
2603-
dataspace=self.dataspace,
26042604
# "unknown" values are overridden
26052605
declared_license_expression="unknown",
26062606
)
@@ -2628,6 +2628,38 @@ def test_package_model_update_from_purldb(self, mock_get_purldb_entries):
26282628
for field_name in updated_fields:
26292629
self.assertEqual(purldb_entry[field_name], getattr(package1, field_name))
26302630

2631+
@mock.patch("component_catalog.models.Package.get_purldb_entries")
def test_package_model_update_from_purldb_multiple_entries(self, mock_get_purldb_entries):
    # Two PurlDB entries are returned for the same purl: only the values
    # that do not conflict between the entries are expected on the Package.
    purldb_entry1 = {
        "uuid": "326aa7a8-4f28-406d-89f9-c1404916925b",
        "purl": "pkg:pypi/django@3.0",
        "type": "pypi",
        "name": "django",
        "version": "3.0",
        "keywords": ["Keyword1", "Keyword2"],
        # Only present in this entry
        "filename": "Django-3.0.tar.gz",
        # Differs from the value in purldb_entry2
        "download_url": "https://files.pythonhosted.org/packages/38/Django-3.0.tar.gz",
    }
    purldb_entry2 = {
        "uuid": "e133e70b-8dd3-4cf1-9711-72b1f57523a0",
        "purl": "pkg:pypi/django@3.0",
        "type": "pypi",
        "name": "django",
        "version": "3.0",
        # Only present in this entry
        "primary_language": "Python",
        "keywords": ["Keyword1", "Keyword2"],
        # Differs from the value in purldb_entry1
        "download_url": "https://another.url/Django-3.0.tar.gz",
    }

    mock_get_purldb_entries.return_value = [purldb_entry1, purldb_entry2]
    package1 = make_package(self.dataspace, package_url="pkg:pypi/django@3.0")
    updated_fields = package1.update_from_purldb(self.user)

    # The conflicting "download_url" is not part of the updated fields.
    # NOTE(review): type/name/version are presumably already set on the
    # Package from its package_url, hence not reported as updated.
    expected = ["filename", "keywords", "primary_language"]
    self.assertEqual(expected, sorted(updated_fields))
    self.assertEqual("Django-3.0.tar.gz", package1.filename)
    self.assertEqual(["Keyword1", "Keyword2"], package1.keywords)
    self.assertEqual("Python", package1.primary_language)
2662+
26312663
@mock.patch("component_catalog.models.Package.get_purldb_entries")
26322664
def test_package_model_update_from_purldb_duplicate_exception(self, mock_get_purldb_entries):
26332665
package_url = "pkg:pypi/[email protected]"

dje/tests/test_utils.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from dje.utils import is_purl_fragment
3939
from dje.utils import is_purl_str
4040
from dje.utils import localized_datetime
41+
from dje.utils import merge_common_non_empty_values
4142
from dje.utils import merge_relations
4243
from dje.utils import normalize_newlines_as_CR_plus_LF
4344
from dje.utils import remove_field_from_query_dict
@@ -539,3 +540,32 @@ def test_utils_localized_datetime(self):
539540
self.assertEqual("Jan 13, 2025, 11:11 AM PST", localized_datetime(dt))
540541
dt = "2025-01-13T19:11:08.216188+01:00"
541542
self.assertEqual("Jan 13, 2025, 10:11 AM PST", localized_datetime(dt))
543+
544+
def test_utils_merge_common_non_empty_values(self):
    # Each key below exercises one merge case; see the inline notes.
    first = {
        "name": "django",
        "version": "3.0",
        "description": "A web framework",  # only non-empty value for this key
        "uuid": "1234",  # conflicts with the other entry
        "empty_field": "",  # empty, and absent from the other entry
        "missing_in_other": "value",
        "keywords": ["Keyword1", "Keyword2"],  # conflicts with the other entry
        "parties": ["a", "b"],
    }
    second = {
        "name": "django",  # identical
        "version": "3.0",  # identical
        "description": "",  # empty, ignored in favor of the other entry
        "uuid": "5678",  # conflicting → excluded
        # "missing_in_other" is deliberately not defined here
        "keywords": ["Keyword1"],  # conflicting → excluded
        "parties": ["a", "b"],  # identical
    }

    merged = merge_common_non_empty_values([first, second])

    expected = {
        "name": "django",
        "version": "3.0",
        "description": "A web framework",
        "missing_in_other": "value",
        "parties": ["a", "b"],
    }
    self.assertEqual(expected, merged)

dje/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -699,3 +699,23 @@ def localized_datetime(datetime):
699699

700700
default_format = get_format("DATETIME_FORMAT")
701701
return date_format(dt, default_format)
702+
703+
704+
def merge_common_non_empty_values(dicts):
    """
    Merge an iterable of dictionaries, keeping only the unambiguous values.

    For each key found in any of the dictionaries, the values listed in
    ``EMPTY_VALUES`` are discarded. Missing keys are treated as empty values.
    A key is included in the result only when at least one non-empty value
    remains and all the remaining values are equal. Keys with conflicting
    non-empty values are left out.

    Keys are returned in the order they are first seen in ``dicts``.
    """
    # Materialize the input since it is iterated once per key below:
    # a generator would be exhausted after the keys collection.
    dicts = list(dicts)

    # Collect all unique keys from all dictionaries. Unlike a set, a dict
    # keeps the first-seen order, making the result order reproducible.
    all_keys = dict.fromkeys(key for entry in dicts for key in entry)

    merged_result = {}
    for key in all_keys:
        values = [value for entry in dicts if (value := entry.get(key)) not in EMPTY_VALUES]

        # Include key only if all the non-empty values are identical
        if values and all(value == values[0] for value in values):
            merged_result[key] = values[0]

    return merged_result

0 commit comments

Comments
 (0)