Skip to content

Commit 1ea5168

Browse files
authored
Merge pull request #1090 from NASA-IMPACT/1051-backend-model-changes-on-cosmos-to-hold-new-incoming-urls-frontend
1051 backend model changes on cosmos to hold new incoming urls frontend
2 parents 0f0d407 + 6112e9a commit 1ea5168

33 files changed

+3806
-508
lines changed

README.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ $ docker-compose -f local.yml run --rm django python manage.py loaddata sde_coll
8282
Navigate to the server running prod, then to the project folder. Run the following command to create a backup:
8383

8484
```bash
85-
docker-compose -f production.yml run --rm --user root django python manage.py dumpdata --natural-foreign --natural-primary --exclude=contenttypes --exclude=auth.Permission --indent 2 --output /app/backups/prod_backup-20240812.json
85+
docker-compose -f production.yml run --rm --user root django python manage.py dumpdata --natural-foreign --natural-primary --exclude=contenttypes --exclude=auth.Permission --indent 2 --output /app/backups/prod_backup-20241114.json
8686
```
8787
This will have saved the backup in a folder outside of the docker container. Now you can copy it to your local machine.
8888

@@ -208,3 +208,20 @@ Eventually, job creation will be done seamlessly by the webapp. Until then, edit
208208
- JavaScript: `/sde_indexing_helper/static/js`
209209
- CSS: `/sde_indexing_helper/static/css`
210210
- Images: `/sde_indexing_helper/static/images`
211+
212+
213+
## Running Long Scripts on the Server
214+
```shell
215+
tmux new -s docker_django
216+
```
217+
Once you are inside the tmux session, you can run dmshell.
218+
219+
Later, you can do this to get back in.
220+
```shell
221+
tmux attach -t docker_django
222+
```
223+
224+
To delete the session:
225+
```shell
226+
tmux kill-session -t docker_django
227+
```

compose/production/traefik/traefik.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ certificatesResolvers:
3131
http:
3232
routers:
3333
web-secure-router:
34-
rule: "Host(`sde-indexing-helper.nasa-impact.net`)"
34+
rule: 'Host(`{{ env "TRAEFIK_DOMAIN" }}`)'
3535
entryPoints:
3636
- web-secure
3737
middlewares:
@@ -42,7 +42,7 @@ http:
4242
certResolver: letsencrypt
4343

4444
flower-secure-router:
45-
rule: "Host(`sde-indexing-helper.nasa-impact.net`)"
45+
rule: 'Host(`{{ env "TRAEFIK_DOMAIN" }}`)'
4646
entryPoints:
4747
- flower
4848
service: flower

config_generation/xmls/plugin_indexing_template.xml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,4 @@
268268
<Name>id</Name>
269269
<Value>doc.url1</Value>
270270
</Mapping>
271-
<Mapping>
272-
<Name>version</Name>
273-
<Value>Md5(doc.url1)</Value>
274-
</Mapping>
275271
</Sinequa>

production.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ services:
3838
- django
3939
volumes:
4040
- production_traefik:/etc/traefik/acme
41+
env_file:
42+
# this should contain either TRAEFIK_DOMAIN=sde-indexing-helper-staging.nasa-impact.net
43+
# or TRAEFIK_DOMAIN=sde-indexing-helper.nasa-impact.net
44+
- ./.envs/.production/.traefik
4145
ports:
4246
- "0.0.0.0:80:80"
4347
- "0.0.0.0:443:443"

requirements/base.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,11 @@ django-cors-headers==4.4.0
2525
django-filter==24.3
2626
djangorestframework-datatables==0.7.2
2727
djangorestframework==3.15.2
28+
factory-boy==3.3.0
2829
lxml==4.9.2
2930
PyGithub==2.2.0
31+
pytest-django==4.8.0
32+
pytest==8.0.0
3033
tqdm==4.66.3
3134
unidecode==1.3.8
3235
xmltodict==0.13.0
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from collections import defaultdict
2+
3+
from django.db import models
4+
5+
from sde_collections.models.candidate_url import CandidateURL
6+
from sde_collections.models.collection import Collection
7+
8+
# Get all field names except 'id' and 'collection' (since we're already looping by collection)
9+
duplicate_fields = [field.name for field in CandidateURL._meta.get_fields() if field.name not in ["id", "collection"]]
10+
11+
12+
def analyze_duplicates():
    """Analyze duplicates and print how many would be deleted in each collection.

    Read-only pass: for every collection, reports the total number of
    CandidateURLs and how many rows ``delete_duplicates`` would remove
    (all but one row of each group whose ``duplicate_fields`` values match).
    Prints a per-collection summary; returns nothing.
    """
    deletion_stats = defaultdict(lambda: {"total": 0, "to_delete": 0})

    for collection in Collection.objects.all():
        # Total URLs in this collection, duplicates or not.
        total_urls = CandidateURL.objects.filter(collection=collection).count()
        deletion_stats[collection.config_folder]["total"] = total_urls

        # Group rows by all duplicate-detection fields; keep groups of size > 1.
        duplicates_in_collection = (
            CandidateURL.objects.filter(collection=collection)
            .values(*duplicate_fields)
            .annotate(count=models.Count("id"))
            .filter(count__gt=1)
        )

        # The annotated `count` already holds the group size, so use it
        # directly — the original re-issued one COUNT query per group.
        for entry in duplicates_in_collection:
            deletion_stats[collection.config_folder]["to_delete"] += entry["count"] - 1

    # Print analysis results.
    print("Duplicate analysis completed.")
    for config_folder, stats in deletion_stats.items():
        # Fixed malformed quoting: the original printed `{folder}' has ...`
        # (stray apostrophe, missing opening quote); now matches the
        # `Collection '...'` style used by delete_duplicates().
        print(f"Collection '{config_folder}' has {stats['total']} total URL(s), with {stats['to_delete']} duplicates.")
41+
42+
43+
def delete_duplicates():
    """Delete duplicate CandidateURLs, keeping one row per duplicate group.

    For every collection, groups CandidateURLs by ``duplicate_fields`` and
    deletes all but the lowest-id row of each group. Prints a per-collection
    count of deleted rows; returns nothing.
    """
    deletion_stats = defaultdict(int)

    for collection in Collection.objects.all():
        # Group rows by all duplicate-detection fields; keep groups of size > 1.
        duplicates_in_collection = (
            CandidateURL.objects.filter(collection=collection)
            .values(*duplicate_fields)
            .annotate(count=models.Count("id"))
            .filter(count__gt=1)
        )

        for entry in duplicates_in_collection:
            # order_by("id") makes the surviving row deterministic — the
            # original sliced an unordered queryset, so which duplicate was
            # kept could differ between runs.
            duplicates = CandidateURL.objects.filter(
                collection=collection, **{field: entry[field] for field in duplicate_fields}
            ).order_by("id")

            # Keep the first (lowest id); delete the rest one-by-one via the
            # instance so any model-level delete() behavior/signals still fire.
            for candidate in duplicates[1:]:
                candidate.delete()
                deletion_stats[collection.config_folder] += 1

    # Print deletion results.
    print("Duplicate URL cleanup completed.")
    for config_folder, deleted_count in deletion_stats.items():
        print(f"Collection '{config_folder}' had {deleted_count} duplicate URL(s) deleted.")
72+
73+
74+
# Usage: run the read-only report first, then perform the actual cleanup.
analyze_duplicates()
delete_duplicates()

sde_collections/admin.py

Lines changed: 69 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,32 @@
33
from django.contrib import admin, messages
44
from django.http import HttpResponse
55

6+
from sde_collections.models.delta_patterns import (
7+
DeltaDivisionPattern,
8+
DeltaTitlePattern,
9+
)
10+
611
from .models.candidate_url import CandidateURL, ResolvedTitle
712
from .models.collection import Collection, WorkflowHistory
13+
from .models.delta_url import CuratedUrl, DeltaResolvedTitle, DeltaUrl, DumpUrl
814
from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
9-
from .tasks import fetch_and_update_full_text, import_candidate_urls_from_api
15+
from .tasks import fetch_and_replace_full_text, import_candidate_urls_from_api
1016

1117

12-
def fetch_and_update_text_for_server(modeladmin, request, queryset, server_name):
18+
def fetch_and_replace_text_for_server(modeladmin, request, queryset, server_name):
1319
for collection in queryset:
14-
fetch_and_update_full_text.delay(collection.id, server_name)
20+
fetch_and_replace_full_text.delay(collection.id, server_name)
1521
modeladmin.message_user(request, f"Started importing URLs from {server_name.upper()} Server")
1622

1723

1824
@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text")
1925
def fetch_full_text_lrm_dev_action(modeladmin, request, queryset):
20-
fetch_and_update_text_for_server(modeladmin, request, queryset, "lrm_dev")
26+
fetch_and_replace_text_for_server(modeladmin, request, queryset, "lrm_dev")
2127

2228

2329
@admin.action(description="Import candidate URLs from XLI Server with Full Text")
24-
def fetch_full_text_lis_action(modeladmin, request, queryset):
25-
fetch_and_update_text_for_server(modeladmin, request, queryset, "xli")
30+
def fetch_full_text_xli_action(modeladmin, request, queryset):
31+
fetch_and_replace_text_for_server(modeladmin, request, queryset, "xli")
2632

2733

2834
@admin.action(description="Generate deployment message")
@@ -234,6 +240,8 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
234240
list_display = (
235241
"name",
236242
"candidate_urls_count",
243+
"delta_urls_count",
244+
"included_curated_urls_count",
237245
"config_folder",
238246
"url",
239247
"division",
@@ -248,15 +256,8 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
248256
"export_as_csv",
249257
"update_config",
250258
download_candidate_urls_as_csv,
251-
import_candidate_urls_test,
252-
import_candidate_urls_production,
253-
import_candidate_urls_secret_test,
254-
import_candidate_urls_secret_production,
255-
import_candidate_urls_xli_server,
256-
import_candidate_urls_lrm_dev_server,
257-
import_candidate_urls_lrm_qa_server,
258259
fetch_full_text_lrm_dev_action,
259-
fetch_full_text_lis_action,
260+
fetch_full_text_xli_action,
260261
]
261262
ordering = ("cleaning_order",)
262263

@@ -317,9 +318,63 @@ class DivisionPatternAdmin(admin.ModelAdmin):
317318
search_fields = ("match_pattern", "division")
318319

319320

321+
# deltas below
322+
class DeltaTitlePatternAdmin(admin.ModelAdmin):
    """Admin view for the DeltaTitlePattern model."""

    list_display = ("match_pattern", "title_pattern", "collection", "match_pattern_type")
    list_filter = ("match_pattern_type", "collection")
335+
336+
337+
class DeltaResolvedTitleAdmin(admin.ModelAdmin):
338+
list_display = ["title_pattern", "delta_url", "resolved_title", "created_at"]
339+
340+
341+
class DeltaDivisionPatternAdmin(admin.ModelAdmin):
342+
list_display = ("collection", "match_pattern", "division")
343+
search_fields = ("match_pattern", "division")
344+
345+
346+
class DumpUrlAdmin(admin.ModelAdmin):
    """Admin view for the DumpUrl model."""

    list_display = ("url", "scraped_title", "collection")
    list_filter = ("collection",)
351+
352+
353+
class DeltaUrlAdmin(admin.ModelAdmin):
    """Admin view for the DeltaUrl model."""

    list_display = ("url", "scraped_title", "generated_title", "collection")
    list_filter = ("collection",)
358+
359+
360+
class CuratedUrlAdmin(admin.ModelAdmin):
    """Admin view for the CuratedUrl model."""

    list_display = ("url", "scraped_title", "generated_title", "collection")
    list_filter = ("collection",)
365+
366+
320367
admin.site.register(WorkflowHistory, WorkflowHistoryAdmin)
321368
admin.site.register(CandidateURL, CandidateURLAdmin)
322369
admin.site.register(TitlePattern, TitlePatternAdmin)
323370
admin.site.register(IncludePattern)
324371
admin.site.register(ResolvedTitle, ResolvedTitleAdmin)
325372
admin.site.register(DivisionPattern, DivisionPatternAdmin)
373+
374+
375+
admin.site.register(DeltaTitlePattern, DeltaTitlePatternAdmin)
376+
admin.site.register(DeltaResolvedTitle, DeltaResolvedTitleAdmin)
377+
admin.site.register(DeltaDivisionPattern, DeltaDivisionPatternAdmin)
378+
admin.site.register(DumpUrl, DumpUrlAdmin)
379+
admin.site.register(DeltaUrl, DeltaUrlAdmin)
380+
admin.site.register(CuratedUrl, CuratedUrlAdmin)

0 commit comments

Comments
 (0)