 from django.core import management

 from config import celery_app
-from sde_collections.models.candidate_url import CandidateURL

 from .models.collection import Collection, WorkflowStatusChoices
 from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
@@ -119,30 +118,31 @@ def _compare_and_populate_delta_urls(collection):
     )


-def populate_dump_urls(collection):
-    urls = Url.objects.filter(collection=collection)
-
-    for url_instance in urls:
-        try:
-            # Create DumpUrl by passing in the parent Url fields
-            dump_url_instance = DumpUrl(
-                id=url_instance.id,
-                collection=url_instance.collection,
-                url=url_instance.url,
-                scraped_title=url_instance.scraped_title,
-                visited=url_instance.visited,
-                document_type=url_instance.document_type,
-                division=url_instance.division,
-            )
-            dump_url_instance.save()  # Save both Url and DumpUrl entries
+# TODO: Bishwas wrote this but it is outdated.
+# def populate_dump_urls(collection):
+#     urls = Url.objects.filter(collection=collection)

-            print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}")
+#     for url_instance in urls:
+#         try:
+#             # Create DumpUrl by passing in the parent Url fields
+#             dump_url_instance = DumpUrl(
+#                 id=url_instance.id,
+#                 collection=url_instance.collection,
+#                 url=url_instance.url,
+#                 scraped_title=url_instance.scraped_title,
+#                 visited=url_instance.visited,
+#                 document_type=url_instance.document_type,
+#                 division=url_instance.division,
+#             )
+#             dump_url_instance.save()  # Save both Url and DumpUrl entries

-        except Exception as e:
-            print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}")
-            continue
+#             print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}")
+
+#         except Exception as e:
+#             print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}")
+#             continue

-    print(f"Successfully populated DumpUrl model with {urls.count()} entries.")
+#     print(f"Successfully populated DumpUrl model with {urls.count()} entries.")


 @celery_app.task(soft_time_limit=10000)
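
For reference, the helper commented out above copied selected Url fields into DumpUrl rows one save() at a time. Below is a minimal sketch of an idempotent variant; the helper name, the import paths, and the update_or_create approach are illustrative assumptions, not part of this commit.

    # Illustrative sketch only; it mirrors the field list from the commented-out
    # helper. Import paths are assumed from the module layout above.
    from .models.delta_url import DumpUrl
    from .models.url import Url  # assumed location of the parent Url model


    def populate_dump_urls_sketch(collection):  # hypothetical name
        urls = Url.objects.filter(collection=collection)
        for url in urls:
            # update_or_create keys on the lookup kwargs (here the shared id)
            # and applies `defaults`, so re-running the copy cannot duplicate rows.
            DumpUrl.objects.update_or_create(
                id=url.id,
                defaults={
                    "collection": url.collection,
                    "url": url.url,
                    "scraped_title": url.scraped_title,
                    "visited": url.visited,
                    "document_type": url.document_type,
                    "division": url.division,
                },
            )
        print(f"Populated DumpUrl with {urls.count()} entries.")
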
@@ -168,8 +168,9 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]):
         print("Loading data into Url model using loaddata...")
         management.call_command("loaddata", urls_file)

-        print("Creating DumpUrl entries...")
-        populate_dump_urls(collection)
+        # TODO: Bishwas wrote this but it does not work.
+        # print("Creating DumpUrl entries...")
+        # populate_dump_urls(collection)

         print("Applying existing patterns; this may take a while")
         collection.apply_all_patterns()
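
In the hunk above, loaddata ingests a serialized fixture file into the Url model. For orientation, a single fixture record has the shape sketched below (written as a Python literal); the "sde_collections.url" app label and the concrete values are assumptions based on the fields this file works with, not taken from the commit.

    # Shape of one Django fixture record, as loaddata expects it.
    # The app label, pk, and field values here are hypothetical.
    fixture_entry = {
        "model": "sde_collections.url",
        "pk": 1,
        "fields": {
            "collection": 42,
            "url": "https://example.nasa.gov/page",
            "scraped_title": "Example Page",
            "visited": True,
        },
    }
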
@@ -253,7 +254,7 @@ def fetch_and_update_full_text(collection_id, server_name):
         if not (doc["url"] and doc["full_text"] and doc["title"]):
             continue

-        CandidateURL.objects.update_or_create(
+        DumpUrl.objects.update_or_create(
             url=doc["url"],
             collection=collection,
             defaults={"scraped_text": doc["full_text"], "scraped_title": doc["title"]},
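
This last hunk swaps the retired CandidateURL model for DumpUrl as the target of the full-text upsert. Django's update_or_create matches a row on the lookup keyword arguments, applies defaults to it, and inserts a new row when nothing matches, returning the object plus a created flag. A short sketch of that behavior with the same fields, using placeholder values:

    # Placeholder document; `collection` is assumed to be an existing Collection.
    doc = {"url": "https://example.nasa.gov/doc", "full_text": "body text", "title": "Example"}

    dump_url, created = DumpUrl.objects.update_or_create(
        url=doc["url"],        # lookup fields identify the row to update
        collection=collection,
        defaults={             # fields written on both update and insert
            "scraped_text": doc["full_text"],
            "scraped_title": doc["title"],
        },
    )
    print("inserted" if created else "updated", dump_url.url)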