Skip to content

Commit 8941b73

Browse files
committed
Create function to check for existing duplicates
Signed-off-by: Jono Yang <[email protected]>
1 parent 17ba976 commit 8941b73

File tree

3 files changed

+105
-21
lines changed

3 files changed

+105
-21
lines changed

minecode/indexing.py

Lines changed: 65 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -119,29 +119,74 @@ def index_package_files(package, scan_data, reindex=False):
119119
return scan_index_errors
120120

121121

122-
def index_package(
123-
scannable_uri, package, scan_data, summary_data, project_extra_data, reindex=False
124-
):
122+
def update_package_relationships(package, existing_package):
    """
    Re-point the index entries and resources related to `existing_package` at
    `package`, then flag `existing_package` as the duplicate and `package` as
    the non-duplicate. Both packages are saved.
    """
    # Related managers of `existing_package` whose rows are moved to `package`,
    # processed in this order.
    related_manager_names = (
        "approximatedirectorycontentindex_set",
        "approximatedirectorystructureindex_set",
        "approximateresourcecontentindex_set",
        "exactfileindex_set",
        "snippetindex_set",
        "stemmedsnippetindex_set",
        "resources",
    )
    for manager_name in related_manager_names:
        getattr(existing_package, manager_name).update(package=package)

    # Record which of the two packages is now the duplicate.
    for pkg, duplicate_flag in ((existing_package, True), (package, False)):
        pkg.is_duplicate = duplicate_flag
        pkg.save()
137+
138+
139+
def check_for_duplicate_packages(package):
    """
    Given a `package`, check whether another non-duplicate Package with the
    same sha1 already exists. If so, check whether `package` is a better
    candidate than each existing package for being the ultimate source for
    that package, and move the existing package's relations over to `package`
    when it is.

    `package` is considered the better candidate when any of these holds:
    - its type is a package repository type and the existing package's type
      is not
    - its type is a git hosting type and the existing package's type is
      neither a package repository type nor a git hosting type
    - both packages have a release date and `package` was released earlier

    Return True if at least one other non-duplicate package with the same
    sha1 exists (whether or not its relations were moved), otherwise return
    False. Return False when `package` has no sha1.
    """
    # Without a checksum there is nothing to compare against.
    if not package.sha1:
        return False

    from packagedb.models import Package

    repo_types = (
        "maven",
        "pypi",
        "npm",
        "crate",
    )
    git_repo_types = (
        "github",
        "gitlab",
        "bitbucket",
    )

    # Check for dupes. `package` itself is excluded: it would otherwise match
    # its own sha1 and always be reported as already indexed. The queryset is
    # materialized once so the final truth test does not depend on rows whose
    # `is_duplicate` flag is changed inside the loop.
    existing_packages = list(
        Package.objects.filter(sha1=package.sha1, is_duplicate=False).exclude(
            pk=package.pk
        )
    )

    package_from_repo = package.type in repo_types
    package_from_git = package.type in git_repo_types

    for existing_package in existing_packages:
        existing_from_repo = existing_package.type in repo_types
        existing_from_git = existing_package.type in git_repo_types

        prefer_repo = package_from_repo and not existing_from_repo
        # NOTE(review): the original clause tested `package.type` against
        # `git_repo_types` twice and could never be true. A git-hosted package
        # is assumed to outrank only packages from neither list, so that it
        # never displaces a package-repository copy -- confirm the intended
        # precedence.
        prefer_git = (
            package_from_git and not existing_from_git and not existing_from_repo
        )
        # see if the package we are indexing is older than the package we have
        released_earlier = bool(
            existing_package.release_date
            and package.release_date
            and existing_package.release_date > package.release_date
        )

        # TODO: This will probably have to be a task
        if prefer_repo or prefer_git or released_earlier:
            update_package_relationships(
                package=package, existing_package=existing_package
            )

    return bool(existing_packages)
183+
184+
185+
def index_package(
186+
scannable_uri, package, scan_data, summary_data, project_extra_data, reindex=False
187+
):
188+
if check_for_duplicate_packages(package):
189+
return
145190

146191
scan_index_errors = []
147192
try:

minecode/tests/test_indexing.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,41 @@ def test_indexing_index_package_dwarf(self):
184184
extra_data = result.first().extra_data
185185
expected_extra_data = scan_data["files"][0]["extra_data"]
186186
self.assertEqual(expected_extra_data, extra_data)
187+
188+
def test_update_package_relationships(self):
    """
    Index the files of a github package, then move its relations to a maven
    package with the same sha1 and check that the resources and the
    `is_duplicate` flags end up on the expected packages.
    """
    existing_package = Package.objects.create(
        download_url="https://github.com//wagon-api/wagon-api-20040705.181715.jar",
        type="github",
        namespace="",
        name="wagon-api",
        version="20040705.181715",
        sha1="12345",
    )
    new_package = Package.objects.create(
        download_url="https://repo1.maven.org/wagon-api-20040705.181715.jar",
        type="maven",
        namespace="",
        name="wagon-api",
        version="20040705.181715",
        sha1="12345",
    )
    scan_data_loc = self.get_test_loc(
        "indexing/scancodeio_wagon-api-20040705.181715.json"
    )
    with open(scan_data_loc, "rb") as f:
        scan_data = json.load(f)

    # Only the side effect of indexing is needed here; the returned errors are
    # not inspected by this test.
    indexing.index_package_files(existing_package, scan_data)
    indexing.update_package_relationships(
        package=new_package, existing_package=existing_package
    )

    # All resources now belong to the new package.
    resources = Resource.objects.filter(package=new_package)
    self.assertEqual(64, len(resources))
    self.assertFalse(Resource.objects.filter(package=existing_package).exists())

    # The duplicate flags were persisted on both packages.
    existing_package.refresh_from_db()
    new_package.refresh_from_db()
    self.assertTrue(existing_package.is_duplicate)
    self.assertFalse(new_package.is_duplicate)

    resource_data = [r.to_dict() for r in resources]
    expected_resources_loc = self.get_test_loc(
        "indexing/scancodeio_wagon-api-20040705.181715-expected.json"
    )
    self.check_expected_results(
        resource_data, expected_resources_loc, regen=FIXTURES_REGEN
    )

packagedb/models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,8 @@ class Package(
555555
),
556556
)
557557
is_duplicate = models.BooleanField(
558-
default=False, help_text=_("True if this Package is a duplicate of another Package")
558+
default=False,
559+
help_text=_("True if this Package is a duplicate of another Package"),
559560
)
560561

561562
objects = PackageQuerySet.as_manager()

0 commit comments

Comments
 (0)