from django.apps import apps
from django.conf import settings
from django.core import management
+from django.core.management.commands import loaddata
from django.db import IntegrityError

from config import celery_app

from .models.collection import Collection, WorkflowStatusChoices
-from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
+from .models.delta_url import DumpUrl
from .sinequa_api import Api
from .utils.github_helper import GitHubHandler

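Side note on the new import: Django's call_command accepts either a command's name as a string or a command instance, which is why the loaddata module is imported here and passed as loaddata.Command() in the second hunk below. A minimal sketch, with the fixture path illustrative:

    from django.core import management
    from django.core.management.commands import loaddata

    # Equivalent invocations; both run the loaddata management command.
    management.call_command("loaddata", "temp/urls.json")
    management.call_command(loaddata.Command(), "temp/urls.json")
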
@@ -63,89 +64,6 @@ def _get_data_to_import(collection, server_name):
    return data_to_import


-def _compare_and_populate_delta_urls(collection):
-    """Compare DumpUrl and CuratedUrl and populate DeltaUrl."""
-    dump_urls = DumpUrl.objects.filter(collection=collection)
-    curated_urls = CuratedUrl.objects.filter(collection=collection)
-
-    DeltaUrl.objects.filter(collection=collection).delete()
-
-    curated_urls_dict = {url.url: url for url in curated_urls}
-
-    # Iterate over Dump URLs to find deltas
-    for dump_url in dump_urls:
-        curated_url = curated_urls_dict.get(dump_url.url)
-
-        if not curated_url:
-            # New URL found, add to DeltaUrl
-            DeltaUrl.objects.create(
-                collection=collection,
-                url=dump_url.url,
-                scraped_title=dump_url.scraped_title,
-                generated_title=dump_url.generated_title,
-                document_type=dump_url.document_type,
-                division=dump_url.division,
-                delete=False,
-            )
-        elif (
-            curated_url.scraped_title != dump_url.scraped_title
-            or curated_url.generated_title != dump_url.generated_title
-            or curated_url.document_type != dump_url.document_type
-            or curated_url.division != dump_url.division
-        ):
-            # Metadata changed, add to DeltaUrl
-            DeltaUrl.objects.create(
-                collection=collection,
-                url=dump_url.url,
-                scraped_title=dump_url.scraped_title,
-                generated_title=dump_url.generated_title,
-                document_type=dump_url.document_type,
-                division=dump_url.division,
-                delete=False,
-            )
-
-    # Mark any missing URLs in CuratedUrl as deleted in DeltaUrl
-    dump_url_set = set(dump_urls.values_list("url", flat=True))
-    for curated_url in curated_urls:
-        if curated_url.url not in dump_url_set:
-            DeltaUrl.objects.create(
-                collection=collection,
-                url=curated_url.url,
-                scraped_title=curated_url.scraped_title,
-                generated_title=curated_url.generated_title,
-                document_type=curated_url.document_type,
-                division=curated_url.division,
-                delete=True,
-            )
-
-
-# TODO: Bishwas wrote this but it is outdated.
-# def populate_dump_urls(collection):
-#     urls = Url.objects.filter(collection=collection)
-
-#     for url_instance in urls:
-#         try:
-#             # Create DumpUrl by passing in the parent Url fields
-#             dump_url_instance = DumpUrl(
-#                 id=url_instance.id,
-#                 collection=url_instance.collection,
-#                 url=url_instance.url,
-#                 scraped_title=url_instance.scraped_title,
-#                 visited=url_instance.visited,
-#                 document_type=url_instance.document_type,
-#                 division=url_instance.division,
-#             )
-#             dump_url_instance.save()  # Save both Url and DumpUrl entries
-
-#             print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}")
-
-#         except Exception as e:
-#             print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}")
-#             continue
-
-#     print(f"Successfully populated DumpUrl model with {urls.count()} entries.")
-
-
@celery_app.task(soft_time_limit=10000)
def import_candidate_urls_from_api(server_name="test", collection_ids=[]):
    TEMP_FOLDER_NAME = "temp"
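Because this function is registered via celery_app.task, callers would normally queue it on a worker rather than invoke it inline. A usage sketch (the collection IDs are illustrative):

    # Queue asynchronously on a Celery worker...
    import_candidate_urls_from_api.delay(server_name="test", collection_ids=[1, 2])
    # ...or call synchronously, e.g. from a Django shell, for debugging.
    import_candidate_urls_from_api(server_name="test", collection_ids=[1, 2])
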
@@ -160,31 +78,26 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]):
        data_to_import = _get_data_to_import(server_name=server_name, collection=collection)
        print(f"Got {len(data_to_import)} records for {collection.config_folder}")

-        print("Clearing DumpUrl model...")
-        DumpUrl.objects.filter(collection=collection).delete()
-
        print("Dumping django fixture to file")
        json.dump(data_to_import, open(urls_file, "w"))

-        print("Loading data into Url model using loaddata...")
-        management.call_command("loaddata", urls_file)
+        print("Deleting existing candidate URLs")
+        # this sometimes takes a while
+        collection.candidate_urls.all().delete()

-        # TODO: Bishwas wrote this but it is does not work.
-        # print("Creating DumpUrl entries...")
-        # populate_dump_urls(collection)
+        print("Loading fixture; this may take a while")
+        # subprocess.call(f'python manage.py loaddata "{urls_file}"', shell=True)
+        management.call_command(loaddata.Command(), urls_file)

        print("Applying existing patterns; this may take a while")
        collection.apply_all_patterns()

-        print("Comparing DumpUrl with CuratedUrl...")
-        _compare_and_populate_delta_urls(collection)
-
-        if collection.workflow_status != WorkflowStatusChoices.ENGINEERING_IN_PROGRESS:
+        if collection.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING:
            collection.workflow_status = WorkflowStatusChoices.ENGINEERING_IN_PROGRESS
            collection.save()

        # Finally set the status to READY_FOR_CURATION
-        # collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION
+        collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION
        collection.save()

        print("Deleting temp files")