class Command(BaseCommand):
    """
    Identify collections that are due for reindexing and, after confirmation, mark them.

    Usage:
        docker-compose -f local.yml run --rm django python manage.py check_collections_reindexing_needed

    A collection is due when either:
    1. its reindexing status is REINDEXING_INDEXED_ON_PROD and its latest ReindexingHistory
       entry is missing or at least ``settings.COLLECTION_REINDEX_INTERVAL_DAYS`` days old, or
    2. its workflow status is PROD_PERFECT, its reindexing status is REINDEXING_NOT_NEEDED, and
       its latest PROD_PERFECT WorkflowHistory entry is missing or at least that old.

    Due collections are listed, and only after the operator answers "yes" are they set to
    REINDEXING_NEEDED_ON_DEV. The command is still in test mode: at most ``TEST_RUN_LIMIT``
    collections are updated per run.
    """

    help = "Identifies and marks collections that need reindexing based on time threshold"

    # Test-mode cap on how many collections a single run may mark.
    TEST_RUN_LIMIT = 5

    @staticmethod
    def _is_stale(history, threshold):
        """Return True when there is no history entry or it was created at/before ``threshold``."""
        return history is None or history.created_at <= threshold

    def handle(self, *args, **options):
        """
        Report the collections that are due for reindexing and optionally mark them.

        Reads ``settings.COLLECTION_REINDEX_INTERVAL_DAYS`` for the age threshold, writes a
        report to ``self.stdout``, prompts on stdin, and saves the updated collections.
        """
        # Import models here to avoid circular imports
        from sde_collections.models.collection import (
            Collection,
            ReindexingHistory,
            WorkflowHistory,
        )
        from sde_collections.models.collection_choice_fields import (
            ReindexingStatusChoices,
            WorkflowStatusChoices,
        )

        self.stdout.write(
            self.style.SUCCESS(
                "\n=== Collection Reindexing Check ===\n"
                f"Threshold: {settings.COLLECTION_REINDEX_INTERVAL_DAYS} days\n"
            )
        )

        threshold = timezone.now() - timedelta(days=settings.COLLECTION_REINDEX_INTERVAL_DAYS)
        collections_to_update = []

        # Case 1: Collections that were previously reindexed on prod.
        # Evaluate the queryset once so the reported count and the loop share a single query.
        prod_reindexed_collections = list(
            Collection.objects.filter(reindexing_status=ReindexingStatusChoices.REINDEXING_INDEXED_ON_PROD)
        )

        self.stdout.write(
            f"\nChecking {len(prod_reindexed_collections)} collections that were "
            f"reindexed on prod (REINDEXING_INDEXED_ON_PROD)..."
        )

        for collection in prod_reindexed_collections:
            # Get latest reindexing history
            latest_history = ReindexingHistory.objects.filter(collection=collection).order_by("-created_at").first()

            if self._is_stale(latest_history, threshold):
                collections_to_update.append(collection)
                self.stdout.write(
                    f"Collection {collection.id} [{collection.name}] needs reindexing - "
                    f"Last Reindexed: {latest_history.created_at if latest_history else 'Never'}"
                )

        # Case 2: Collections that completed first-time indexing (PROD_PERFECT).
        # Restricting to REINDEXING_NOT_NEEDED skips collections that are already somewhere
        # in a reindexing process.
        first_time_collections = list(
            Collection.objects.filter(
                workflow_status=WorkflowStatusChoices.PROD_PERFECT,
                reindexing_status=ReindexingStatusChoices.REINDEXING_NOT_NEEDED,
            )
        )

        self.stdout.write(
            f"\nChecking {len(first_time_collections)} collections that are in PROD_PERFECT workflow status..."
        )

        for collection in first_time_collections:
            # Get when collection reached PROD_PERFECT
            prod_perfect_history = (
                WorkflowHistory.objects.filter(
                    collection=collection, workflow_status=WorkflowStatusChoices.PROD_PERFECT
                )
                .order_by("-created_at")
                .first()
            )

            if self._is_stale(prod_perfect_history, threshold):
                collections_to_update.append(collection)
                self.stdout.write(
                    f"Collection {collection.id:<5} [{collection.name:<60}] needs reindexing - "
                    f"In PROD_PERFECT since: {prod_perfect_history.created_at if prod_perfect_history else 'Unknown'}"
                )

        # Show summary and ask for confirmation
        self.stdout.write(
            self.style.MIGRATE_HEADING(
                f"\nSummary:"
                f"\n- Total collections to update: {len(collections_to_update)}"
                f"\n- Will be marked as: {ReindexingStatusChoices.REINDEXING_NEEDED_ON_DEV.label}"
                f"\n- First {self.TEST_RUN_LIMIT} collections will be processed in this test run"
            )
        )

        if not collections_to_update:
            return

        try:
            user_input = input("Do you want to mark these collections for reindexing? (yes/no)")
        except EOFError:
            # stdin is closed (e.g. run without a TTY): treat it as "no" instead of crashing.
            user_input = ""

        # Tolerate surrounding whitespace so an answer like "yes " is not read as a refusal.
        if user_input.strip().lower() != "yes":
            self.stdout.write(self.style.WARNING("\nUpdate Cancelled!"))
            return

        # Process only the first TEST_RUN_LIMIT collections while in test mode
        batch = collections_to_update[: self.TEST_RUN_LIMIT]
        for collection in batch:
            collection.reindexing_status = ReindexingStatusChoices.REINDEXING_NEEDED_ON_DEV
            collection.save()
            self.stdout.write(f"✓ Marked collection: {collection.name}")

        self.stdout.write(
            self.style.SUCCESS(
                f"\nSuccessfully marked {len(batch)} collections for reindexing"
                f"\nNote: Only processed first {self.TEST_RUN_LIMIT} collections in test mode"
            )
        )
@celery_app.task()
def check_collections_reindexing_needed():
    """
    Mark every collection that is due for reindexing as REINDEXING_NEEDED_ON_DEV.

    A collection is due when either:
    1. its reindexing status is REINDEXING_INDEXED_ON_PROD and its latest ReindexingHistory
       entry is missing or at least ``settings.COLLECTION_REINDEX_INTERVAL_DAYS`` days old, or
    2. its workflow status is PROD_PERFECT, its reindexing status is REINDEXING_NOT_NEEDED, and
       its latest PROD_PERFECT WorkflowHistory entry is missing or at least that old.

    Returns:
        str: Summary of the form "Marked <n> collections for reindexing".
    """
    import logging

    from sde_collections.models.collection import (
        Collection,
        ReindexingHistory,
        WorkflowHistory,
    )
    from sde_collections.models.collection_choice_fields import (
        ReindexingStatusChoices,
        WorkflowStatusChoices,
    )

    # Report through logging rather than print() so messages carry a level and logger name.
    logger = logging.getLogger(__name__)

    threshold = timezone.now() - timedelta(days=settings.COLLECTION_REINDEX_INTERVAL_DAYS)

    def is_stale(history):
        """Return True when there is no history entry or it was created at/before the threshold."""
        return history is None or history.created_at <= threshold

    # Case 1: Collections that were previously reindexed on prod.
    # Evaluate the queryset once so the logged count and the loop share a single query.
    prod_reindexed_collections = list(
        Collection.objects.filter(reindexing_status=ReindexingStatusChoices.REINDEXING_INDEXED_ON_PROD)
    )
    logger.info(
        "Checking %d collections that were reindexed on prod (REINDEXING_INDEXED_ON_PROD)...",
        len(prod_reindexed_collections),
    )
    collections_to_update = [
        collection
        for collection in prod_reindexed_collections
        if is_stale(ReindexingHistory.objects.filter(collection=collection).order_by("-created_at").first())
    ]

    # Case 2: Collections that completed first-time indexing (PROD_PERFECT) and are not
    # already part of a reindexing process.
    first_time_collections = list(
        Collection.objects.filter(
            workflow_status=WorkflowStatusChoices.PROD_PERFECT,
            reindexing_status=ReindexingStatusChoices.REINDEXING_NOT_NEEDED,
        )
    )
    logger.info(
        "Checking %d collections that are in PROD_PERFECT workflow status...",
        len(first_time_collections),
    )
    for collection in first_time_collections:
        prod_perfect_history = (
            WorkflowHistory.objects.filter(collection=collection, workflow_status=WorkflowStatusChoices.PROD_PERFECT)
            .order_by("-created_at")
            .first()
        )
        if is_stale(prod_perfect_history):
            collections_to_update.append(collection)

    # Update all collections
    marked_count = len(collections_to_update)
    logger.info("Found %d collections that need reindexing", marked_count)

    for collection in collections_to_update:
        collection.reindexing_status = ReindexingStatusChoices.REINDEXING_NEEDED_ON_DEV
        collection.save()

    logger.info("Successfully marked %d collections for reindexing", marked_count)
    return f"Marked {marked_count} collections for reindexing"