Skip to content

Commit e934a9c

Browse files
committed
refactor status change logic and add indexing complete status
1 parent febee1b commit e934a9c

File tree

4 files changed

+170
-36
lines changed

4 files changed

+170
-36
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Generated by Django 4.2.9 on 2024-12-10 19:18
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
7+
8+
dependencies = [
9+
("sde_collections", "0072_collection_reindexing_status_reindexinghistory"),
10+
]
11+
12+
operations = [
13+
migrations.AlterField(
14+
model_name="collection",
15+
name="workflow_status",
16+
field=models.IntegerField(
17+
choices=[
18+
(1, "Research in Progress"),
19+
(2, "Ready for Engineering"),
20+
(3, "Engineering in Progress"),
21+
(4, "Ready for Curation"),
22+
(5, "Curation in Progress"),
23+
(6, "Curated"),
24+
(7, "Quality Fixed"),
25+
(8, "Secret Deployment Started"),
26+
(9, "Secret Deployment Failed"),
27+
(10, "Ready for LRM Quality Check"),
28+
(11, "Ready for Quality Check"),
29+
(12, "QC: Failed"),
30+
(18, "QC: Minor Issues"),
31+
(13, "QC: Perfect"),
32+
(14, "Prod: Perfect"),
33+
(15, "Prod: Minor Issues"),
34+
(16, "Prod: Major Issues"),
35+
(17, "Code Merge Pending"),
36+
(19, "Delete from Prod"),
37+
(20, "Indexing Finished on LRM Dev"),
38+
],
39+
default=1,
40+
),
41+
),
42+
migrations.AlterField(
43+
model_name="workflowhistory",
44+
name="old_status",
45+
field=models.IntegerField(
46+
choices=[
47+
(1, "Research in Progress"),
48+
(2, "Ready for Engineering"),
49+
(3, "Engineering in Progress"),
50+
(4, "Ready for Curation"),
51+
(5, "Curation in Progress"),
52+
(6, "Curated"),
53+
(7, "Quality Fixed"),
54+
(8, "Secret Deployment Started"),
55+
(9, "Secret Deployment Failed"),
56+
(10, "Ready for LRM Quality Check"),
57+
(11, "Ready for Quality Check"),
58+
(12, "QC: Failed"),
59+
(18, "QC: Minor Issues"),
60+
(13, "QC: Perfect"),
61+
(14, "Prod: Perfect"),
62+
(15, "Prod: Minor Issues"),
63+
(16, "Prod: Major Issues"),
64+
(17, "Code Merge Pending"),
65+
(19, "Delete from Prod"),
66+
(20, "Indexing Finished on LRM Dev"),
67+
],
68+
null=True,
69+
),
70+
),
71+
migrations.AlterField(
72+
model_name="workflowhistory",
73+
name="workflow_status",
74+
field=models.IntegerField(
75+
choices=[
76+
(1, "Research in Progress"),
77+
(2, "Ready for Engineering"),
78+
(3, "Engineering in Progress"),
79+
(4, "Ready for Curation"),
80+
(5, "Curation in Progress"),
81+
(6, "Curated"),
82+
(7, "Quality Fixed"),
83+
(8, "Secret Deployment Started"),
84+
(9, "Secret Deployment Failed"),
85+
(10, "Ready for LRM Quality Check"),
86+
(11, "Ready for Quality Check"),
87+
(12, "QC: Failed"),
88+
(18, "QC: Minor Issues"),
89+
(13, "QC: Perfect"),
90+
(14, "Prod: Perfect"),
91+
(15, "Prod: Minor Issues"),
92+
(16, "Prod: Major Issues"),
93+
(17, "Code Merge Pending"),
94+
(19, "Delete from Prod"),
95+
(20, "Indexing Finished on LRM Dev"),
96+
],
97+
default=1,
98+
),
99+
),
100+
]

sde_collections/models/collection.py

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from slugify import slugify
1212

1313
from config_generation.db_to_xml import XmlEditor
14+
from sde_collections.tasks import fetch_and_replace_full_text
1415

1516
from ..utils.github_helper import GitHubHandler
1617
from ..utils.slack_utils import (
@@ -161,12 +162,6 @@ def migrate_dump_to_delta(self):
161162
# self.refresh_url_lists_for_all_patterns() # TODO: I'm pretty confident we shouldn't be running this
162163
self.apply_all_patterns()
163164

164-
# After migrating, check if we should update reindexing status
165-
curated_urls_count = self.curated_urls.count()
166-
if curated_urls_count > 0:
167-
self.reindexing_status = ReindexingStatusChoices.REINDEXING_READY_FOR_CURATION
168-
self.save()
169-
170165
def create_or_update_delta_url(self, url_instance, to_delete=False):
171166
"""
172167
Creates or updates a DeltaUrl entry based on the given DumpUrl or CuratedUrl object.
@@ -236,12 +231,6 @@ def promote_to_curated(self):
236231
# Step 4: Reapply patterns to DeltaUrls
237232
self.refresh_url_lists_for_all_patterns()
238233

239-
# After promoting, check if we should update reindexing status
240-
curated_urls_count = self.curated_urls.count()
241-
if curated_urls_count > 0:
242-
self.reindexing_status = ReindexingStatusChoices.REINDEXING_CURATED
243-
self.save()
244-
245234
def add_to_public_query(self):
246235
"""Add the collection to the public query."""
247236
if self.workflow_status not in [
@@ -788,20 +777,32 @@ def reindexing_status_button_color(self) -> str:
788777

789778
@receiver(post_save, sender=Collection)
790779
def create_configs_on_status_change(sender, instance, created, **kwargs):
791-
"""
792-
Creates various config files on certain workflow status changes
793-
"""
794-
795-
if "workflow_status" in instance.tracker.changed():
796-
if instance.workflow_status == WorkflowStatusChoices.READY_FOR_CURATION:
797-
instance.create_plugin_config(overwrite=True)
798-
elif instance.workflow_status == WorkflowStatusChoices.CURATED:
799-
instance.promote_to_curated()
800-
elif instance.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING:
801-
instance.create_scraper_config(overwrite=False)
802-
instance.create_indexer_config(overwrite=False)
803-
elif instance.workflow_status in [
804-
WorkflowStatusChoices.QUALITY_CHECK_PERFECT,
805-
WorkflowStatusChoices.QUALITY_CHECK_MINOR,
806-
]:
807-
instance.add_to_public_query()
780+
"""Creates various config files on certain workflow status changes"""
781+
782+
if getattr(instance, "_handling_status_change", False):
783+
return
784+
785+
try:
786+
instance._handling_status_change = True
787+
788+
if "workflow_status" in instance.tracker.changed():
789+
if instance.workflow_status == WorkflowStatusChoices.READY_FOR_CURATION:
790+
instance.create_plugin_config(overwrite=True)
791+
elif instance.workflow_status == WorkflowStatusChoices.CURATED:
792+
instance.promote_to_curated()
793+
elif instance.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING:
794+
instance.create_scraper_config(overwrite=False)
795+
instance.create_indexer_config(overwrite=False)
796+
elif instance.workflow_status == WorkflowStatusChoices.INDEXING_FINISHED_ON_DEV:
797+
fetch_and_replace_full_text.delay(instance.id, "lrm_dev")
798+
elif instance.workflow_status in [
799+
WorkflowStatusChoices.QUALITY_CHECK_PERFECT,
800+
WorkflowStatusChoices.QUALITY_CHECK_MINOR,
801+
]:
802+
instance.add_to_public_query()
803+
804+
if "reindexing_status" in instance.tracker.changed():
805+
if instance.reindexing_status == ReindexingStatusChoices.REINDEXING_FINISHED_ON_DEV:
806+
fetch_and_replace_full_text.delay(instance.id, "lrm_dev")
807+
finally:
808+
instance._handling_status_change = False

sde_collections/models/collection_choice_fields.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ class WorkflowStatusChoices(models.IntegerChoices):
9797
PROD_MAJOR = 16, "Prod: Major Issues"
9898
MERGE_PENDING = 17, "Code Merge Pending"
9999
NEEDS_DELETE = 19, "Delete from Prod"
100+
INDEXING_FINISHED_ON_DEV = 20, "Indexing Finished on LRM Dev"
100101

101102

102103
class ReindexingStatusChoices(models.IntegerChoices):

sde_collections/tasks.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,11 @@
1010
from django.db import transaction
1111

1212
from config import celery_app
13+
from sde_collections.models.collection_choice_fields import (
14+
ReindexingStatusChoices,
15+
WorkflowStatusChoices,
16+
)
1317

14-
from .models.collection import Collection, WorkflowStatusChoices
1518
from .models.delta_url import DumpUrl
1619
from .sinequa_api import Api
1720
from .utils.github_helper import GitHubHandler
@@ -68,6 +71,7 @@ def _get_data_to_import(collection, server_name):
6871
def import_candidate_urls_from_api(server_name="test", collection_ids=[]):
6972
TEMP_FOLDER_NAME = "temp"
7073
os.makedirs(TEMP_FOLDER_NAME, exist_ok=True)
74+
Collection = apps.get_model("sde_collections", "Collection")
7175

7276
collections = Collection.objects.filter(id__in=collection_ids)
7377

@@ -106,19 +110,25 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]):
106110

107111
@celery_app.task()
108112
def push_to_github_task(collection_ids):
113+
Collection = apps.get_model("sde_collections", "Collection")
114+
109115
collections = Collection.objects.filter(id__in=collection_ids)
110116
github_handler = GitHubHandler(collections)
111117
github_handler.push_to_github()
112118

113119

114120
@celery_app.task()
115121
def sync_with_production_webapp():
122+
Collection = apps.get_model("sde_collections", "Collection")
123+
116124
for collection in Collection.objects.all():
117125
collection.sync_with_production_webapp()
118126

119127

120128
@celery_app.task()
121129
def pull_latest_collection_metadata_from_github():
130+
Collection = apps.get_model("sde_collections", "Collection")
131+
122132
FILENAME = "github_collections.json"
123133

124134
gh = GitHubHandler(collections=Collection.objects.none())
@@ -149,21 +159,25 @@ def resolve_title_pattern(title_pattern_id):
149159
def fetch_and_replace_full_text(collection_id, server_name):
150160
"""
151161
Task to fetch and replace full text and metadata for a collection.
152-
Handles data in batches to manage memory usage.
162+
Handles data in batches to manage memory usage and updates appropriate statuses
163+
upon completion.
153164
"""
165+
Collection = apps.get_model("sde_collections", "Collection")
166+
154167
collection = Collection.objects.get(id=collection_id)
155168
api = Api(server_name)
156169

170+
initial_workflow_status = collection.workflow_status
171+
initial_reindexing_status = collection.reindexing_status
172+
157173
# Step 1: Delete existing DumpUrl entries
158174
deleted_count, _ = DumpUrl.objects.filter(collection=collection).delete()
159175
print(f"Deleted {deleted_count} old records.")
160176

161-
# Step 2: Process data in batches
162-
total_processed = 0
163-
164177
try:
178+
# Step 2: Process data in batches
179+
total_processed = 0
165180
for batch in api.get_full_texts(collection.config_folder):
166-
# Use bulk_create for efficiency, with a transaction per batch
167181
with transaction.atomic():
168182
DumpUrl.objects.bulk_create(
169183
[
@@ -176,13 +190,31 @@ def fetch_and_replace_full_text(collection_id, server_name):
176190
for record in batch
177191
]
178192
)
179-
180193
total_processed += len(batch)
181194
print(f"Processed batch of {len(batch)} records. Total: {total_processed}")
182195

183196
# Step 3: Migrate dump URLs to delta URLs
184197
collection.migrate_dump_to_delta()
185198

199+
# Step 4: Update statuses if needed
200+
collection.refresh_from_db()
201+
202+
# Check workflow status transition
203+
pre_workflow_statuses = [
204+
WorkflowStatusChoices.RESEARCH_IN_PROGRESS,
205+
WorkflowStatusChoices.READY_FOR_ENGINEERING,
206+
WorkflowStatusChoices.ENGINEERING_IN_PROGRESS,
207+
WorkflowStatusChoices.INDEXING_FINISHED_ON_DEV,
208+
]
209+
if initial_workflow_status in pre_workflow_statuses:
210+
collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION
211+
collection.save()
212+
213+
# Check reindexing status transition
214+
if initial_reindexing_status == ReindexingStatusChoices.REINDEXING_FINISHED_ON_DEV:
215+
collection.reindexing_status = ReindexingStatusChoices.REINDEXING_READY_FOR_CURATION
216+
collection.save()
217+
186218
return f"Successfully processed {total_processed} records and updated the database."
187219

188220
except Exception as e:

0 commit comments

Comments
 (0)