diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ab909b0..a12582a9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -183,3 +183,12 @@ For each PR made, an entry should be added to this changelog. It should contain
     - physics_of_the_cosmos
     - stsci_space_telescope_science_institute
   - Once the front end has been updated to allow for tag edits, all astrophysics collections will be marked to be run through the pipeline
+
+- 1295-asynchronous-metrics-download-in-admin-panel
+  - Description: Implemented an asynchronous metrics download in the Django admin
+  - Changes:
+    - Button Addition: Integrated a 'Metrics' button into the collection changelist
+    - Task Generation: download_metrics handles button clicks to start a Celery task, display a download link, and redirect back to the collection list
+    - Cleanup Mechanism: Removes old metrics files from the directory, keeping only the files for the current task
+    - Background Processing: Runs the generate_metrics task asynchronously to gather data and generate a CSV in MEDIA_ROOT/metrics/
+    - File Retrieval: get_metrics_file checks file availability and size, serving the download when ready or a status message while generation is in progress
diff --git a/requirements/base.txt b/requirements/base.txt
index a4ca9cf8..0b81d058 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -34,3 +34,4 @@ tenacity==8.2.2
 tqdm==4.66.3
 unidecode==1.3.8
 xmltodict==0.13.0
+numpy==1.24.3
diff --git a/sde_collections/admin.py b/sde_collections/admin.py
index 766d8f3f..5a45bc60 100644
--- a/sde_collections/admin.py
+++ b/sde_collections/admin.py
@@ -1,8 +1,13 @@
 import csv
+import os
+import uuid
 
 from django import forms
+from django.conf import settings
 from django.contrib import admin, messages
-from django.http import HttpResponse
+from django.http import FileResponse, HttpResponse
+from django.urls import path
+from django.utils.safestring import mark_safe
 
 from sde_collections.models.delta_patterns import (
     DeltaDivisionPattern,
@@ -15,7 +20,7 @@
 from .models.collection_choice_fields import TDAMMTags
 from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
 from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
-from .tasks import fetch_full_text, import_candidate_urls_from_api
+from .tasks import fetch_full_text, generate_metrics, import_candidate_urls_from_api
 
 
 def fetch_and_replace_text_for_server(modeladmin, request, queryset, server_name):
@@ -296,6 +301,84 @@ def included_curated_urls_count(self, obj) -> int:
     ]
     ordering = ("cleaning_order",)
+
+    def changelist_view(self, request, extra_context=None):
+        """Add a 'Metrics' button to the collection changelist."""
+        extra_context = extra_context or {}
+        extra_context["show_metrics_button"] = True
+        extra_context["metrics_url"] = request.path + "metrics/"
+        return super().changelist_view(request, extra_context=extra_context)
+
+    def get_urls(self):
+        """Add custom endpoints for the metrics functionality."""
+        urls = super().get_urls()
+        custom_urls = [
+            path(
+                "metrics/", self.admin_site.admin_view(self.download_metrics), name="sde_collections_collection_metrics"
+            ),
+            path(
+                "metrics/<str:task_id>/",
+                self.admin_site.admin_view(self.get_metrics_file),
+                name="sde_collections_get_metrics",
+            ),
+        ]
+        return custom_urls + urls
+
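+    # How the two endpoints below work together: a GET to metrics/ queues the
+    # Celery task and redirects back to the changelist, while metrics/<task_id>/
+    # serves metrics_<task_id>.csv once the worker has renamed its .tmp
+    # working file into place.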
+    def download_metrics(self, request):
+        """Custom view that starts metrics generation and returns to the collection list."""
+        task_id = str(uuid.uuid4())
+        generate_metrics.delay(task_id)
+
+        download_url = request.path.rsplit("metrics/", 1)[0] + f"metrics/{task_id}/"
+
+        messages.add_message(
+            request,
+            messages.INFO,
+            mark_safe(
+                f"Metrics generation started. Please wait a moment and then "
+                f'<a href="{download_url}">click here</a> to download when ready.'
+            ),
+        )
+        return HttpResponse(status=303, headers={"Location": request.path.replace("/metrics/", "")})
+
+    def get_metrics_file(self, request, task_id):
+        """Serve the generated metrics file if it exists and is valid."""
+        file_path = os.path.join(settings.MEDIA_ROOT, "metrics", f"metrics_{task_id}.csv")
+
+        # Create the retry URL
+        current_url = request.build_absolute_uri()
+
+        # Check if the file exists and is not empty (minimum size 100 bytes)
+        if os.path.exists(file_path) and os.path.getsize(file_path) > 100:
+            response = FileResponse(open(file_path, "rb"), content_type="text/csv")
+            response["Content-Disposition"] = 'attachment; filename="metrics.csv"'
+            return response
+        else:
+            # Also check if there's a temporary file indicating the task is still running
+            temp_file_path = os.path.join(settings.MEDIA_ROOT, "metrics", f"metrics_{task_id}.tmp")
+            if os.path.exists(temp_file_path):
+                messages.add_message(
+                    request,
+                    messages.INFO,
+                    mark_safe(
+                        f"The metrics file is still being generated. "
+                        f'<a href="{current_url}">Click here</a> to try again.'
+                    ),
+                )
+            else:
+                messages.add_message(
+                    request,
+                    messages.WARNING,
+                    mark_safe(
+                        f"The metrics file is not ready yet. "
+                        f'<a href="{current_url}">Click here</a> to try again.'
+                    ),
+                )
+        return HttpResponse(status=303, headers={"Location": request.path.replace(f"/metrics/{task_id}/", "")})
+
 
 
 @admin.action(description="Exclude URL and all children")
 def exclude_pattern(modeladmin, request, queryset):
diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py
index 98f26a34..859bdade 100644
--- a/sde_collections/tasks.py
+++ b/sde_collections/tasks.py
@@ -1,9 +1,12 @@
 # /sde_collections/tasks.py
+import csv
 import json
 import os
 import shutil
 
 import boto3
+import numpy as np
+import requests
 from django.apps import apps
 from django.conf import settings
 from django.core import management
@@ -229,3 +232,292 @@ def migrate_dump_to_delta_and_handle_status_transistions(collection_id):
     collection.save()
 
     return f"Successfully migrated DumpUrls to DeltaUrls for collection {collection.name}."
+
+
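+# The task below makes three passes over the same search endpoint, each paging
+# through results until an empty page comes back. Everything is written to a
+# .tmp file that is only renamed to .csv at the very end, so a partially
+# generated file is never served as a download.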
+@celery_app.task(name="generate_metrics")
+def generate_metrics(task_id):
+    """
+    Asynchronously generate metrics and save them to a downloadable file.
+    """
+    try:
+        # Ensure the metrics directory exists in the media root
+        metrics_dir = os.path.join(settings.MEDIA_ROOT, "metrics")
+        os.makedirs(metrics_dir, exist_ok=True)
+
+        # Clean up old metrics files
+        for filename in os.listdir(metrics_dir):
+            if filename.startswith("metrics_") and (filename.endswith(".csv") or filename.endswith(".tmp")):
+                # Skip the current task's files
+                if not filename.startswith(f"metrics_{task_id}"):
+                    file_path = os.path.join(metrics_dir, filename)
+                    try:
+                        os.remove(file_path)
+                        print(f"Deleted old metrics file: {filename}")
+                    except Exception as e:
+                        print(f"Failed to delete {filename}: {str(e)}")
+
+        # Use a temporary file during generation
+        temp_file_path = os.path.join(metrics_dir, f"metrics_{task_id}.tmp")
+        final_file_path = os.path.join(metrics_dir, f"metrics_{task_id}.csv")
+
+        # Initialize common variables
+        divArr = [
+            "/Astrophysics/",
+            "/Biological and Physical Sciences/",
+            "/Earth Science/",
+            "/Heliophysics/",
+            "/Planetary Science/",
+        ]
+        docArr = ["Data", "Images", "Documentation", "Software and Tools", "Missions and Instruments"]
+
+        url = "https://sciencediscoveryengine.nasa.gov/api/v1/search.query"
+
+        # Write CSV data to the temporary file
+        with open(temp_file_path, "w", newline="") as csvfile:
+            writer = csv.writer(csvfile)
+
+            # SECTION 1: DOCUMENT COUNT METRICS
+            writer.writerow(["SECTION 1: DOCUMENT COUNT BY DIVISION AND DOCUMENT TYPE"])
+            writer.writerow([])
+
+            # Initialize the document count array: rows 0-4 follow divArr order,
+            # row 5 accumulates the "All Divisions" totals
+            recCounter = np.zeros((6, 5))
+
+            # Process documents through the API
+            pageCount = 1
+            pageSize = 1000
+
+            print("Fetching document count metrics...")
+
+            while True:
+                payload = {
+                    "app": "nasa-sba-smd",
+                    "query": {
+                        "name": "query-smd-primary",
+                        "page": pageCount,
+                        "pageSize": pageSize,
+                        "scope": "All",
+                        "advanced": {},
+                    },
+                }
+
+                print(f"Processing page {pageCount} for document count")
+                response_data = requests.post(url, headers={}, json=payload, verify=False).json()
+
+                num_records = len(response_data.get("records", []))
+                if num_records == 0:
+                    break
+
+                for record in response_data["records"]:
+                    if "sourcestr56" in record:
+                        # Extract division and document type
+                        division = record.get("treepath", [""])[0]
+                        doctype = record["sourcestr56"]
+
+                        # Update the per-division and all-divisions counters
+                        for j in range(len(docArr)):
+                            for k in range(len(divArr)):
+                                if doctype == docArr[j] and divArr[k] in division:
+                                    recCounter[k, j] += 1
+                                    recCounter[5, j] += 1
+
+                pageCount += 1
+
+            # Write document count metrics
+            writer.writerow(["Division", "Document Type", "Count"])
+
+            # Add All Divisions data
+            writer.writerow(["All Divisions", "Data", str(int(recCounter[5, 0]))])
+            writer.writerow(["All Divisions", "Images", str(int(recCounter[5, 1]))])
+            writer.writerow(["All Divisions", "Documentation", str(int(recCounter[5, 2]))])
+            writer.writerow(["All Divisions", "Software and Tools", str(int(recCounter[5, 3]))])
+            writer.writerow(["All Divisions", "Missions and Instruments", str(int(recCounter[5, 4]))])
+            writer.writerow(["All Divisions", "Total", str(int(np.sum(recCounter[5, :])))])
+
+            # Add data for each division
+            division_names = [
+                "Astrophysics",
+                "Biological and Physical Sciences",
+                "Earth Science",
+                "Heliophysics",
+                "Planetary Science",
+            ]
+            for i, div_name in enumerate(division_names):
+                writer.writerow([div_name, "Data", str(int(recCounter[i, 0]))])
+                writer.writerow([div_name, "Images", str(int(recCounter[i, 1]))])
+                writer.writerow([div_name, "Documentation", str(int(recCounter[i, 2]))])
+                writer.writerow([div_name, "Software and Tools", str(int(recCounter[i, 3]))])
+                writer.writerow([div_name, "Missions and Instruments", str(int(recCounter[i, 4]))])
+                writer.writerow([div_name, "Total", str(int(np.sum(recCounter[i, :])))])
+
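+            # Section 2 counts distinct sources rather than documents: each
+            # "collection" value is recorded once, tagged with the first
+            # treepath (division) it was seen under.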
+            # SECTION 2: SOURCES COUNT
+            writer.writerow([])
+            writer.writerow(["SECTION 2: SOURCES COUNT BY DIVISION"])
+            writer.writerow([])
+
+            # Initialize sources and divisions lists
+            sources = []
+            divs = []
+
+            # Reset page counter for the new query
+            pageCount = 1
+
+            print("Fetching sources count metrics...")
+
+            while True:
+                payload = {
+                    "app": "nasa-sba-smd",
+                    "query": {
+                        "name": "query-smd-primary",
+                        "page": pageCount,
+                        "pageSize": pageSize,
+                        "scope": "All",
+                        "advanced": {},
+                    },
+                }
+
+                print(f"Processing page {pageCount} for sources count")
+                response_data = requests.post(url, headers={}, json=payload, verify=False).json()
+
+                num_records = len(response_data.get("records", []))
+                if num_records == 0:
+                    break
+
+                for record in response_data["records"]:
+                    if "sourcestr56" in record:
+                        source_name = record.get("collection", [""])[0]
+                        division = record.get("treepath", [""])[0]
+
+                        if source_name not in sources:
+                            sources.append(source_name)
+                            divs.append(division)
+
+                pageCount += 1
+
+            # Count sources by division
+            division_counts = {div: 0 for div in divArr}
+            total_count = len(sources)
+
+            for div_path in divs:
+                for div in divArr:
+                    if div in div_path:
+                        division_counts[div] += 1
+
+            # Write sources count metrics
+            writer.writerow(["Division", "Count"])
+
+            for div in divArr:
+                writer.writerow([div.strip("/"), str(division_counts[div])])
+
+            writer.writerow(["Total", str(total_count)])
+
+            # SECTION 3: DOCUMENTS BY SOURCE
+            writer.writerow([])
+            writer.writerow(["SECTION 3: DOCUMENTS BY SOURCE"])
+            writer.writerow([])
+
+            # List of new sources to track
+            new_sources = [
+                "AIM: Aeronomy of Ice in the Mesosphere",
+                "ASDC: Atmospheric Science Data Center",
+                "G-LiHT",
+                "GENESIS: Global Environmental & Earth Science Information System",
+                "HyTES: Hyperspectral Thermal Emission Spectrometer",
+                "IBM-NASA Prithvi Models Family",
+                "COMET ASTEROID TELESCOPIC CATALOG HUNTER",
+                "Escape and Plasma Acceleration and Dynamics Explorers (ESCAPADE)",
+                "Extreme Ultraviolet Variability Experiment (EVE)",
+                "Crustal Dynamics Data Information System",
+                "Aura Atmospheric Chemistry",
+                "LDOPE: Land Data Operational Products Evaluation",
+                "CPL: Cloud Physics Lidar",
+                "Direct Readout Laboratory",
+                "Center for Near Earth Object Studies (CNEOS)",
+            ]
+
+            # Initialize counter array for source documents
+            nsCounter = np.zeros((len(new_sources), 5, 5))
+
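+            # Axis convention for nsCounter, relied on by the slices further
+            # down: nsCounter[n, k, j] -> source n, division k, document type j.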
+            # Reset page counter for the new query
+            pageCount = 1
+
+            print("Fetching source documents metrics...")
+
+            while True:
+                payload = {
+                    "app": "nasa-sba-smd",
+                    "query": {
+                        "name": "query-smd-primary",
+                        "page": pageCount,
+                        "pageSize": pageSize,
+                        "scope": "All",
+                        "advanced": {},
+                    },
+                }
+
+                print(f"Processing page {pageCount} for source documents")
+                response_data = requests.post(url, headers={}, json=payload, verify=False).json()
+
+                num_records = len(response_data.get("records", []))
+                if num_records == 0:
+                    break
+
+                for record in response_data["records"]:
+                    if "sourcestr56" in record:
+                        # Both the source name and the division are taken from
+                        # the record's treepath here
+                        source_name = record.get("treepath", [""])[0]
+                        division = record.get("treepath", [""])[0]
+                        doc_type = record.get("sourcestr56", "")
+
+                        for n, new_source in enumerate(new_sources):
+                            if new_source in source_name:
+                                for j, doc in enumerate(docArr):
+                                    if doc_type == doc:
+                                        for k, div in enumerate(divArr):
+                                            if div in division:
+                                                nsCounter[n, k, j] += 1
+
+                pageCount += 1
+
+            # Write source documents metrics
+            for n, source in enumerate(new_sources):
+                total_source_docs = np.sum(nsCounter[n, :, :])
+                writer.writerow([f"Source: {source}"])
+                writer.writerow(["The total number of new documents:", str(int(total_source_docs))])
+
+                for j, doc_type in enumerate(docArr):
+                    doc_count = np.sum(nsCounter[n, :, j])
+                    writer.writerow([f"The total number of new {doc_type.lower()} documents:", str(int(doc_count))])
+                writer.writerow([])
+
+            writer.writerow(["DIVISION BREAKDOWN"])
+            writer.writerow([])
+
+            for d, division in enumerate(divArr):
+                div_name = division.strip("/")
+                total_docs = np.sum(nsCounter[:, d, :])
+                writer.writerow([f"Division: {div_name}"])
+                writer.writerow(["The total number of new documents:", str(int(total_docs))])
+
+                for t, doc_type in enumerate(docArr):
+                    doc_count = np.sum(nsCounter[:, d, t])
+                    writer.writerow([f"The total number of new {doc_type.lower()} documents:", str(int(doc_count))])
+                writer.writerow([])
+
+            total_overall = np.sum(nsCounter[:, :, :])
+            writer.writerow(["All Divisions"])
+            writer.writerow(["The total number of new documents:", str(int(total_overall))])
+
+            for t, doc_type in enumerate(docArr):
+                total_doc_type = np.sum(nsCounter[:, :, t])
+                writer.writerow([f"The total number of new {doc_type.lower()} documents:", str(int(total_doc_type))])
+
+        # Move the file to its final location once generation has completed
+        shutil.move(temp_file_path, final_file_path)
+
+        print(f"Metrics generation complete. File saved to {final_file_path}")
+        return True
+
+    except Exception as e:
+        print(f"Error in generate_metrics: {str(e)}")
+        return False
diff --git a/sde_collections/templates/admin/sde_collections/collection/change_list.html b/sde_collections/templates/admin/sde_collections/collection/change_list.html
new file mode 100644
index 00000000..616c0165
--- /dev/null
+++ b/sde_collections/templates/admin/sde_collections/collection/change_list.html
@@ -0,0 +1,15 @@
+{% extends "admin/change_list.html" %}
+{% load i18n %}
+
+{% block object-tools %}
+  {% if show_metrics_button %}
+    <ul class="object-tools">
+      <li>
+        <a href="{{ metrics_url }}" class="addlink">
+          {% trans "Download Metrics" %}
+        </a>
+      </li>
+    </ul>
+  {% endif %}
+  {{ block.super }}
+{% endblock %}