Skip to content

Commit 1a3233c

Browse files
authored
Merge pull request #591 from aboutcode-org/index-time-matching
Index time matching
2 parents b257482 + 8c8b3e4 commit 1a3233c

File tree

4 files changed

+183
-0
lines changed

4 files changed

+183
-0
lines changed

minecode/indexing.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,95 @@ def index_package_files(package, scan_data, reindex=False):
119119
return scan_index_errors
120120

121121

122+
def update_package_relationships(package, existing_package):
    """
    Re-point every index entry and resource of `existing_package` at `package`.

    Once the related rows have been moved, `existing_package` is flagged as a
    duplicate and saved, then `package` is flagged as not a duplicate and
    saved.
    """
    # Related managers on `existing_package` whose rows are moved to `package`,
    # listed in the order in which they are updated.
    relation_names = (
        "approximatedirectorycontentindex_set",
        "approximatedirectorystructureindex_set",
        "approximateresourcecontentindex_set",
        "exactfileindex_set",
        "snippetindex_set",
        "stemmedsnippetindex_set",
        "resources",
    )
    for relation_name in relation_names:
        getattr(existing_package, relation_name).update(package=package)

    # The old owner is saved first, then the new one.
    for pkg, duplicate_flag in ((existing_package, True), (package, False)):
        pkg.is_duplicate = duplicate_flag
        pkg.save()
137+
138+
139+
def check_for_duplicate_packages(package):
    """
    Given a `package`, check to see if another Package with the same SHA1 has
    already been indexed. If so, check to see if `package` is a better
    candidate than that existing Package for being the ultimate source, and
    if it is, re-point the existing Package's relations at `package`.

    `package` is considered a better candidate when any of these holds:
    - its type is a package repository type and the existing one's is not
    - its type is a source repository type and the existing one's is not
    - both have a release date and `package` was released earlier

    Return True if at least one other non-duplicate Package with the same SHA1
    exists (whether or not its relations were re-pointed), otherwise return
    False. Return False when `package` has no SHA1.
    """
    if not package.sha1:
        return False

    # Function-scope import, as in the original; placed after the guard so
    # the early return does not need the model to be importable.
    from packagedb.models import Package

    repo_types = frozenset(
        [
            "apache",
            "bower",
            "composer",
            "cpan",
            "cran",
            "crate",
            "deb",
            "docker",
            "eclipse",
            "fdroid",
            "gem",
            "golang",
            "gstreamer",
            "maven",
            "npm",
            "nuget",
            "openwrt",
            "pypi",
            "rpm",
        ]
    )
    source_repo_types = frozenset(
        [
            "bitbucket",
            "github",
            "gitlab",
            "googlecode",
            "sourceforge",
        ]
    )

    # Check for dupes. `package` itself is excluded: it shares its own SHA1
    # and is not flagged as a duplicate by default, so without the exclusion
    # a saved package would always be reported as already existing.
    existing_packages = list(
        Package.objects.filter(sha1=package.sha1, is_duplicate=False).exclude(
            pk=package.pk
        )
    )
    for existing_package in existing_packages:
        # see if the package we are indexing is older than the package we have
        # TODO: This will probably have to be a task
        from_package_repo = (
            package.type in repo_types and existing_package.type not in repo_types
        )
        from_source_repo = (
            package.type in source_repo_types
            and existing_package.type not in source_repo_types
        )
        if (
            from_package_repo
            or from_source_repo
            or (
                (existing_package.release_date and package.release_date)
                and (existing_package.release_date > package.release_date)
            )
        ):
            update_package_relationships(
                package=package, existing_package=existing_package
            )

    return bool(existing_packages)
203+
204+
122205
def index_package(
123206
scannable_uri, package, scan_data, summary_data, project_extra_data, reindex=False
124207
):
208+
if check_for_duplicate_packages(package):
209+
return
210+
125211
scan_index_errors = []
126212
try:
127213
indexing_errors = index_package_files(package, scan_data, reindex=reindex)

minecode/tests/test_indexing.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import json
1111
import os
12+
from datetime import datetime
1213

1314
from matchcode.models import ApproximateDirectoryContentIndex
1415
from matchcode.models import ApproximateDirectoryStructureIndex
@@ -184,3 +185,74 @@ def test_indexing_index_package_dwarf(self):
184185
extra_data = result.first().extra_data
185186
expected_extra_data = scan_data["files"][0]["extra_data"]
186187
self.assertEqual(expected_extra_data, extra_data)
188+
189+
def test_update_check_for_duplicate_packages(self):
    """
    Resources indexed for one Package are moved to another Package with the
    same SHA1 by `update_package_relationships`.
    """
    shared_fields = {
        "namespace": "",
        "name": "wagon-api",
        "version": "20040705.181715",
        "sha1": "12345",
    }
    test_package1 = Package.objects.create(
        download_url="https://github.com//wagon-api/wagon-api-20040705.181715.jar",
        type="github",
        **shared_fields,
    )
    test_package2 = Package.objects.create(
        download_url="https://repo1.maven.org/wagon-api-20040705.181715.jar",
        type="maven",
        **shared_fields,
    )

    scan_data_loc = self.get_test_loc(
        "indexing/scancodeio_wagon-api-20040705.181715.json"
    )
    with open(scan_data_loc, "rb") as scan_file:
        scan_data = json.load(scan_file)

    # Index the files under the first package, then hand them to the second.
    indexing.index_package_files(test_package1, scan_data)
    indexing.update_package_relationships(
        package=test_package2, existing_package=test_package1
    )

    moved_resources = Resource.objects.filter(package=test_package2)
    self.assertEqual(64, len(moved_resources))

    expected_resources_loc = self.get_test_loc(
        "indexing/scancodeio_wagon-api-20040705.181715-expected.json"
    )
    self.check_expected_results(
        [resource.to_dict() for resource in moved_resources],
        expected_resources_loc,
        regen=FIXTURES_REGEN,
    )
226+
227+
def test_update_check_for_duplicate_packages_release_date(self):
    """
    Resources indexed for one Package that has a release date are moved to
    another Package with the same SHA1, and the duplicate flags are set on
    both Packages.
    """
    test_package1 = Package.objects.create(
        download_url="https://bitbucket.com//wagon-api/wagon-api-20040705.181715.jar",
        type="bitbucket",
        namespace="",
        name="wagon-api",
        version="20040705.181715",
        sha1="12345",
        release_date=datetime.now(),
    )
    test_package2 = Package.objects.create(
        download_url="https://github.com/wagon-api-20040705.181715.jar",
        type="github",
        namespace="",
        name="wagon-api",
        version="20040705.181715",
        sha1="12345",
        release_date=datetime.now(),
    )
    scan_data_loc = self.get_test_loc(
        "indexing/scancodeio_wagon-api-20040705.181715.json"
    )
    with open(scan_data_loc, "rb") as f:
        scan_data = json.loads(f.read())

    # Test that resources are moved to the package created first.
    # `update_package_relationships` is called directly here, so the release
    # dates themselves are not compared by this test.
    indexing.index_package_files(test_package2, scan_data)
    indexing.update_package_relationships(
        package=test_package1, existing_package=test_package2
    )
    resources = Resource.objects.filter(package=test_package1)
    self.assertEqual(64, len(resources))

    # The duplicate flags set by `update_package_relationships` must be
    # persisted: the former owner is a duplicate, the new owner is not.
    test_package1.refresh_from_db()
    test_package2.refresh_from_db()
    self.assertFalse(test_package1.is_duplicate)
    self.assertTrue(test_package2.is_duplicate)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Generated by Django 5.1.5 on 2025-03-12 21:44
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the boolean `is_duplicate` field (default False) to `package`."""

    dependencies = [("packagedb", "0090_alter_packageactivity_uuid")]

    operations = [
        migrations.AddField(
            model_name="package",
            name="is_duplicate",
            field=models.BooleanField(
                help_text="True if this Package is a duplicate of another Package",
                default=False,
            ),
        )
    ]

packagedb/models.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,10 @@ class Package(
554554
"A mapping containing a summary and license clarity score for this Package"
555555
),
556556
)
557+
is_duplicate = models.BooleanField(
558+
default=False,
559+
help_text=_("True if this Package is a duplicate of another Package"),
560+
)
557561

558562
objects = PackageQuerySet.as_manager()
559563

0 commit comments

Comments
 (0)