Commit 0f57932

remove deprecated tasks.py code
1 parent a3bc1bd commit 0f57932

File tree: 1 file changed (+10, −97 lines)

sde_collections/tasks.py

Lines changed: 10 additions & 97 deletions
@@ -6,12 +6,13 @@
 from django.apps import apps
 from django.conf import settings
 from django.core import management
+from django.core.management.commands import loaddata
 from django.db import IntegrityError

 from config import celery_app

 from .models.collection import Collection, WorkflowStatusChoices
-from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
+from .models.delta_url import DumpUrl
 from .sinequa_api import Api
 from .utils.github_helper import GitHubHandler

@@ -63,89 +64,6 @@ def _get_data_to_import(collection, server_name):
     return data_to_import


-def _compare_and_populate_delta_urls(collection):
-    """Compare DumpUrl and CuratedUrl and populate DeltaUrl."""
-    dump_urls = DumpUrl.objects.filter(collection=collection)
-    curated_urls = CuratedUrl.objects.filter(collection=collection)
-
-    DeltaUrl.objects.filter(collection=collection).delete()
-
-    curated_urls_dict = {url.url: url for url in curated_urls}
-
-    # Iterate over Dump URLs to find deltas
-    for dump_url in dump_urls:
-        curated_url = curated_urls_dict.get(dump_url.url)
-
-        if not curated_url:
-            # New URL found, add to DeltaUrl
-            DeltaUrl.objects.create(
-                collection=collection,
-                url=dump_url.url,
-                scraped_title=dump_url.scraped_title,
-                generated_title=dump_url.generated_title,
-                document_type=dump_url.document_type,
-                division=dump_url.division,
-                delete=False,
-            )
-        elif (
-            curated_url.scraped_title != dump_url.scraped_title
-            or curated_url.generated_title != dump_url.generated_title
-            or curated_url.document_type != dump_url.document_type
-            or curated_url.division != dump_url.division
-        ):
-            # Metadata changed, add to DeltaUrl
-            DeltaUrl.objects.create(
-                collection=collection,
-                url=dump_url.url,
-                scraped_title=dump_url.scraped_title,
-                generated_title=dump_url.generated_title,
-                document_type=dump_url.document_type,
-                division=dump_url.division,
-                delete=False,
-            )
-
-    # Mark any missing URLs in CuratedUrl as deleted in DeltaUrl
-    dump_url_set = set(dump_urls.values_list("url", flat=True))
-    for curated_url in curated_urls:
-        if curated_url.url not in dump_url_set:
-            DeltaUrl.objects.create(
-                collection=collection,
-                url=curated_url.url,
-                scraped_title=curated_url.scraped_title,
-                generated_title=curated_url.generated_title,
-                document_type=curated_url.document_type,
-                division=curated_url.division,
-                delete=True,
-            )
-
-
-# TODO: Bishwas wrote this but it is outdated.
-# def populate_dump_urls(collection):
-#     urls = Url.objects.filter(collection=collection)
-
-#     for url_instance in urls:
-#         try:
-#             # Create DumpUrl by passing in the parent Url fields
-#             dump_url_instance = DumpUrl(
-#                 id=url_instance.id,
-#                 collection=url_instance.collection,
-#                 url=url_instance.url,
-#                 scraped_title=url_instance.scraped_title,
-#                 visited=url_instance.visited,
-#                 document_type=url_instance.document_type,
-#                 division=url_instance.division,
-#             )
-#             dump_url_instance.save()  # Save both Url and DumpUrl entries
-
-#             print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}")
-
-#         except Exception as e:
-#             print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}")
-#             continue
-
-#     print(f"Successfully populated DumpUrl model with {urls.count()} entries.")
-
-
 @celery_app.task(soft_time_limit=10000)
 def import_candidate_urls_from_api(server_name="test", collection_ids=[]):
     TEMP_FOLDER_NAME = "temp"
@@ -160,31 +78,26 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]):
         data_to_import = _get_data_to_import(server_name=server_name, collection=collection)
         print(f"Got {len(data_to_import)} records for {collection.config_folder}")

-        print("Clearing DumpUrl model...")
-        DumpUrl.objects.filter(collection=collection).delete()
-
         print("Dumping django fixture to file")
         json.dump(data_to_import, open(urls_file, "w"))

-        print("Loading data into Url model using loaddata...")
-        management.call_command("loaddata", urls_file)
+        print("Deleting existing candidate URLs")
+        # this sometimes takes a while
+        collection.candidate_urls.all().delete()

-        # TODO: Bishwas wrote this but it is does not work.
-        # print("Creating DumpUrl entries...")
-        # populate_dump_urls(collection)
+        print("Loading fixture; this may take a while")
+        # subprocess.call(f'python manage.py loaddata "{urls_file}"', shell=True)
+        management.call_command(loaddata.Command(), urls_file)

         print("Applying existing patterns; this may take a while")
         collection.apply_all_patterns()

-        print("Comparing DumpUrl with CuratedUrl...")
-        _compare_and_populate_delta_urls(collection)
-
-        if collection.workflow_status != WorkflowStatusChoices.ENGINEERING_IN_PROGRESS:
+        if collection.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING:
             collection.workflow_status = WorkflowStatusChoices.ENGINEERING_IN_PROGRESS
             collection.save()

         # Finally set the status to READY_FOR_CURATION
-        # collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION
+        collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION
         collection.save()

         print("Deleting temp files")
