
Commit 77a2496

change full_text task to point to DumpUrl
1 parent: 2235811


1 file changed, +26 −25 lines


sde_collections/tasks.py

Lines changed: 26 additions & 25 deletions
@@ -8,7 +8,6 @@
 from django.core import management

 from config import celery_app
-from sde_collections.models.candidate_url import CandidateURL

 from .models.collection import Collection, WorkflowStatusChoices
 from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
@@ -119,30 +118,31 @@ def _compare_and_populate_delta_urls(collection):
     )


-def populate_dump_urls(collection):
-    urls = Url.objects.filter(collection=collection)
-
-    for url_instance in urls:
-        try:
-            # Create DumpUrl by passing in the parent Url fields
-            dump_url_instance = DumpUrl(
-                id=url_instance.id,
-                collection=url_instance.collection,
-                url=url_instance.url,
-                scraped_title=url_instance.scraped_title,
-                visited=url_instance.visited,
-                document_type=url_instance.document_type,
-                division=url_instance.division,
-            )
-            dump_url_instance.save()  # Save both Url and DumpUrl entries
+# TODO: Bishwas wrote this but it is outdated.
+# def populate_dump_urls(collection):
+#     urls = Url.objects.filter(collection=collection)

-            print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}")
+#     for url_instance in urls:
+#         try:
+#             # Create DumpUrl by passing in the parent Url fields
+#             dump_url_instance = DumpUrl(
+#                 id=url_instance.id,
+#                 collection=url_instance.collection,
+#                 url=url_instance.url,
+#                 scraped_title=url_instance.scraped_title,
+#                 visited=url_instance.visited,
+#                 document_type=url_instance.document_type,
+#                 division=url_instance.division,
+#             )
+#             dump_url_instance.save()  # Save both Url and DumpUrl entries

-        except Exception as e:
-            print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}")
-            continue
+#             print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}")
+
+#         except Exception as e:
+#             print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}")
+#             continue

-    print(f"Successfully populated DumpUrl model with {urls.count()} entries.")
+#     print(f"Successfully populated DumpUrl model with {urls.count()} entries.")


 @celery_app.task(soft_time_limit=10000)
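
If the disabled helper is ever revived, here is a minimal sketch of an idempotent rewrite, keyed on url + collection the same way the final hunk of this commit is. The helper name is hypothetical and not part of the commit; it assumes DumpUrl stores its own copies of these fields and that Url is already imported in tasks.py, as the commented-out code implies:

def populate_dump_urls_sketch(collection):
    # Hypothetical, idempotent rewrite of the commented-out helper above.
    urls = Url.objects.filter(collection=collection)
    for url_instance in urls.iterator():
        # Keying on url + collection lets re-runs update rows in place
        # instead of failing on duplicate primary keys.
        DumpUrl.objects.update_or_create(
            url=url_instance.url,
            collection=url_instance.collection,
            defaults={
                "scraped_title": url_instance.scraped_title,
                "visited": url_instance.visited,
                "document_type": url_instance.document_type,
                "division": url_instance.division,
            },
        )
    print(f"Populated DumpUrl with {urls.count()} entries.")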
@@ -168,8 +168,9 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]):
     print("Loading data into Url model using loaddata...")
     management.call_command("loaddata", urls_file)

-    print("Creating DumpUrl entries...")
-    populate_dump_urls(collection)
+    # TODO: Bishwas wrote this but it does not work.
+    # print("Creating DumpUrl entries...")
+    # populate_dump_urls(collection)

     print("Applying existing patterns; this may take a while")
     collection.apply_all_patterns()
@@ -253,7 +254,7 @@ def fetch_and_update_full_text(collection_id, server_name):
         if not (doc["url"] and doc["full_text"] and doc["title"]):
             continue

-        CandidateURL.objects.update_or_create(
+        DumpUrl.objects.update_or_create(
             url=doc["url"],
             collection=collection,
             defaults={"scraped_text": doc["full_text"], "scraped_title": doc["title"]},
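
This one-line swap is the change the commit message describes: full-text results are now written to DumpUrl instead of CandidateURL. With Django's update_or_create, the plain keyword arguments (url, collection) are the lookup keys and defaults holds the values written on either path. A minimal sketch of that behavior, with doc and collection shaped as in the hunk above:

# doc and collection are assumed to have the shape used in the hunk above.
dump_url, created = DumpUrl.objects.update_or_create(
    url=doc["url"],          # lookup key
    collection=collection,   # lookup key
    defaults={               # applied on both the update and the create path
        "scraped_text": doc["full_text"],
        "scraped_title": doc["title"],
    },
)
if created:
    print(f"Inserted new DumpUrl for {dump_url.url}")
else:
    print(f"Refreshed full text for {dump_url.url}")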

0 commit comments
