From cd7bf037f5d67dfd77b0fa1b508eb3ef1fc53f00 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 31 Mar 2025 13:56:59 -0500
Subject: [PATCH 1/5] Metrics Download Feature_01

---
 sde_collections/admin.py        |  73 ++++-
 sde_collections/tasks.py        | 268 ++++++++++++++++++
 .../collection/change_list.html |  15 +
 3 files changed, 354 insertions(+), 2 deletions(-)
 create mode 100644 sde_collections/templates/admin/sde_collections/collection/change_list.html

diff --git a/sde_collections/admin.py b/sde_collections/admin.py
index 766d8f3f..8f897eea 100644
--- a/sde_collections/admin.py
+++ b/sde_collections/admin.py
@@ -2,7 +2,12 @@
 
 from django import forms
 from django.contrib import admin, messages
-from django.http import HttpResponse
+from django.http import HttpResponse, FileResponse
+from django.utils.safestring import mark_safe
+from django.urls import path
+import uuid
+import os
+from django.conf import settings
 
 from sde_collections.models.delta_patterns import (
     DeltaDivisionPattern,
@@ -15,7 +20,7 @@
 from .models.collection_choice_fields import TDAMMTags
 from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
 from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
-from .tasks import fetch_full_text, import_candidate_urls_from_api
+from .tasks import fetch_full_text, import_candidate_urls_from_api, generate_metrics
 
 
 def fetch_and_replace_text_for_server(modeladmin, request, queryset, server_name):
@@ -295,6 +300,70 @@ def included_curated_urls_count(self, obj) -> int:
         fetch_full_text_xli_action,
     ]
     ordering = ("cleaning_order",)
+
+    def changelist_view(self, request, extra_context=None):
+        """
+        To add a button for metrics download
+        """
+        extra_context = extra_context or {}
+        extra_context['show_metrics_button'] = True
+        extra_context['metrics_url'] = request.path + 'metrics/'
+        return super().changelist_view(request, extra_context=extra_context)
+
+    def get_urls(self):
+        """
+        To add custom endpoints for metrics functionality
+        """
+        urls = super().get_urls()
+        custom_urls = [
+            path('metrics/', self.admin_site.admin_view(self.download_metrics), name='sde_collections_collection_metrics'),
+            path('metrics/<str:task_id>/', self.admin_site.admin_view(self.get_metrics_file), name='sde_collections_get_metrics'),
+        ]
+        return custom_urls + urls
+
+    def download_metrics(self, request):
+        """Custom view that starts metrics generation and returns to collection list"""
+        task_id = str(uuid.uuid4())
+        task = generate_metrics.delay(task_id)
+
+        download_url = request.path.rsplit('metrics/', 1)[0] + f'metrics/{task_id}/'
+
+        messages.add_message(
+            request,
+            messages.INFO,
+            mark_safe(f"Metrics generation started. Please wait a moment and then <a href='{download_url}'>click here</a> to download when ready.")
+        )
+        return HttpResponse(status=303, headers={"Location": request.path.replace('/metrics/', '')})
+
+    def get_metrics_file(self, request, task_id):
+        """Serve the generated metrics file if it exists and is valid"""
+
+        file_path = os.path.join(settings.MEDIA_ROOT, 'metrics', f'metrics_{task_id}.csv')
+
+        # Create the retry URL
+        current_url = request.build_absolute_uri()
+
+        # Check if file exists and is not empty (minimum size 100 bytes)
+        if os.path.exists(file_path) and os.path.getsize(file_path) > 100:
+            response = FileResponse(open(file_path, 'rb'), content_type='text/csv')
+            response['Content-Disposition'] = 'attachment; filename="metrics.csv"'
+            return response
+        else:
+            # Also check if there's a temporary file indicating task is still running
+            temp_file_path = os.path.join(settings.MEDIA_ROOT, 'metrics', f'metrics_{task_id}.tmp')
+            if os.path.exists(temp_file_path):
+                messages.add_message(
+                    request,
+                    messages.INFO,
+                    mark_safe(f"The metrics file is still being generated. <a href='{current_url}'>Click here</a> to try again.")
+                )
+            else:
+                messages.add_message(
+                    request,
+                    messages.WARNING,
+                    mark_safe(f"The metrics file is not ready yet. <a href='{current_url}'>Click here</a> to try again.")
+                )
+            return HttpResponse(status=303, headers={"Location": request.path.replace(f'/metrics/{task_id}/', '')})
 
 
 @admin.action(description="Exclude URL and all children")
diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py
index 98f26a34..2ca2de06 100644
--- a/sde_collections/tasks.py
+++ b/sde_collections/tasks.py
@@ -2,6 +2,9 @@
 import json
 import os
 import shutil
+import csv
+import requests
+import numpy as np
 
 import boto3
 from django.apps import apps
@@ -229,3 +232,268 @@ def migrate_dump_to_delta_and_handle_status_transistions(collection_id):
 
     collection.save()
     return f"Successfully migrated DumpUrls to DeltaUrls for collection {collection.name}."
+
+
+@celery_app.task(name="generate_metrics")
+def generate_metrics(task_id):
+    """
+    To asynchronously generate metrics
+    """
+    try:
+        # Generate a file path in the media directory
+        metrics_dir = os.path.join(settings.MEDIA_ROOT, 'metrics')
+        os.makedirs(metrics_dir, exist_ok=True)
+
+        # Use a temporary file during generation
+        temp_file_path = os.path.join(metrics_dir, f'metrics_{task_id}.tmp')
+        final_file_path = os.path.join(metrics_dir, f'metrics_{task_id}.csv')
+
+        # Initialize common variables
+        divArr = ['/Astrophysics/','/Biological and Physical Sciences/','/Earth Science/','/Heliophysics/','/Planetary Science/']
+        docArr = ['Data','Images','Documentation','Software and Tools','Missions and Instruments']
+
+        url = "https://sciencediscoveryengine.nasa.gov/api/v1/search.query"
+
+        # Create a temporary buffer for CSV data
+        with open(temp_file_path, 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile)
+
+            # SECTION 1: DOCUMENT COUNT METRICS
+            writer.writerow(['SECTION 1: DOCUMENT COUNT BY DIVISION AND DOCUMENT TYPE'])
+            writer.writerow([])
+
+            # Initialize document count array
+            recCounter = np.zeros((6,5))
+
+            # Process documents through API
+            pageCount = 1
+            pageSize = 1000
+
+            print("Fetching document count metrics...")
+
+            while True:
+                payload = {
+                    "app": 'nasa-sba-smd',
+                    "query": {
+                        "name": 'query-smd-primary',
+                        "page": pageCount,
+                        "pageSize": pageSize,
+                        "scope": 'All',
+                        "advanced": {},
+                    },
+                }
+
+                print(f"Processing page {pageCount} for document count")
+                response_data = requests.post(url, headers={}, json=payload, verify=False).json()
+
+                num_records = len(response_data.get('records', []))
+                if num_records == 0:
+                    break
+
+                for record in response_data['records']:
+                    if 'sourcestr56' in record:
+                        # Extract division and document type
+                        division = record.get('treepath', [''])[0]
+                        doctype = record['sourcestr56']
+
+                        # Update counters
+                        for j in range(0, len(docArr)):
+                            for k in range(0, len(divArr)):
+                                if doctype == docArr[j]:
+                                    if divArr[k] in division:
+                                        recCounter[k, j] += 1
+                                        recCounter[5, j] += 1
+
+                pageCount += 1
+
+            # Write document count metrics
+            writer.writerow(['Division', 'Document Type', 'Count'])
+
+            # Add All Divisions data
+            writer.writerow(['All Divisions', 'Data', str(int(recCounter[5, 0]))])
+            writer.writerow(['All Divisions', 'Images', str(int(recCounter[5, 1]))])
+            writer.writerow(['All Divisions', 'Documentation', str(int(recCounter[5, 2]))])
+            writer.writerow(['All Divisions', 'Software and Tools', str(int(recCounter[5, 3]))])
+            writer.writerow(['All Divisions', 'Missions and Instruments', str(int(recCounter[5, 4]))])
+            writer.writerow(['All Divisions', 'Total', str(int(np.sum(recCounter[5, :])))])
+
+            # Add data for each division
+            division_names = ['Astrophysics', 'Biological and Physical Sciences', 'Earth Science', 'Heliophysics', 'Planetary Science']
+            for i, div_name in enumerate(division_names):
+                writer.writerow([div_name, 'Data', str(int(recCounter[i, 0]))])
+                writer.writerow([div_name, 'Images', str(int(recCounter[i, 1]))])
+                writer.writerow([div_name, 'Documentation', str(int(recCounter[i, 2]))])
+                writer.writerow([div_name, 'Software and Tools', str(int(recCounter[i, 3]))])
+                writer.writerow([div_name, 'Missions and Instruments', str(int(recCounter[i, 4]))])
+                writer.writerow([div_name, 'Total', str(int(np.sum(recCounter[i, :])))])
+
+            # SECTION 2: SOURCES COUNT
+            writer.writerow([])
+            writer.writerow(['SECTION 2: SOURCES COUNT BY DIVISION'])
+            writer.writerow([])
+
+            # Initialize sources and divisions lists
+            sources = []
+            divs = []
+
+            # Reset page counter for new query
+            pageCount = 1
+
+            print("Fetching sources count metrics...")
+
+            while True:
+                payload = {
+                    "app": 'nasa-sba-smd',
+                    "query": {
+                        "name": 'query-smd-primary',
+                        "page": pageCount,
+                        "pageSize": pageSize,
+                        "scope": 'All',
+                        "advanced": {},
+                    },
+                }
+
+                print(f"Processing page {pageCount} for sources count")
+                response_data = requests.post(url, headers={}, json=payload, verify=False).json()
+
+                num_records = len(response_data.get('records', []))
+                if num_records == 0:
+                    break
+
+                for record in response_data['records']:
+                    if 'sourcestr56' in record:
+                        source_name = record.get('collection', [''])[0]
+                        division = record.get('treepath', [''])[0]
+
+                        if source_name not in sources:
+                            sources.append(source_name)
+                            divs.append(division)
+
+                pageCount += 1
+
+            # Count sources by division
+            division_counts = {div: 0 for div in divArr}
+            total_count = len(sources)
+
+            for i, div_path in enumerate(divs):
+                for div in divArr:
+                    if div in div_path:
+                        division_counts[div] += 1
+
+            # Write sources count metrics
+            writer.writerow(['Division', 'Count'])
+
+            for div in divArr:
+                writer.writerow([div.strip('/'), str(division_counts[div])])
+
+            writer.writerow(['Total', str(total_count)])
+
+            # SECTION 3: DOCUMENTS BY SOURCE
+            writer.writerow([])
+            writer.writerow(['SECTION 3: DOCUMENTS BY SOURCE'])
+            writer.writerow([])
+
+            # List of new sources to track
+            new_sources = [
+                "AIM: Aeronomy of Ice in the Mesosphere",
+                "ASDC: Atmospheric Science Data Center",
+                "G-LiHT",
+                "GENESIS: Global Environmental & Earth Science Information System",
+                "HyTES: Hyperspectral Thermal Emission Spectrometer",
+                "IBM-NASA Prithvi Models Family",
+                "COMET ASTEROID TELESCOPIC CATALOG HUNTER",
+                "Escape and Plasma Acceleration and Dynamics Explorers (ESCAPADE)",
+                "Extreme Ultraviolet Variability Experiment (EVE)",
+                "Crustal Dynamics Data Information System",
+                "Aura Atmospheric Chemistry",
+                "LDOPE: Land Data Operational Products Evaluation",
+                "CPL: Cloud Physics Lidar",
+                "Direct Readout Laboratory",
+                "Center for Near Earth Object Studies (CNEOS)"
+            ]
+
+            # Initialize counter array for source documents
+            nsCounter = np.zeros((len(new_sources), 5, 5))
+
+            # Reset page counter for new query
+            pageCount = 1
+
+            print("Fetching source documents metrics...")
+
+            while True:
+                payload = {
+                    "app": 'nasa-sba-smd',
+                    "query": {
+                        "name": 'query-smd-primary',
+                        "page": pageCount,
+                        "pageSize": pageSize,
+                        "scope": 'All',
+                        "advanced": {},
+                    },
+                }
+
+                print(f"Processing page {pageCount} for source documents")
+                response_data = requests.post(url, headers={}, json=payload, verify=False).json()
+
+                num_records = len(response_data.get('records', []))
+                if num_records == 0:
+                    break
+
+                for record in response_data['records']:
+                    if 'sourcestr56' in record:
+                        source_name = record.get('treepath', [''])[0]
+                        division = record.get('treepath', [''])[0]
+                        doc_type = record.get('sourcestr56', '')
+
+                        for n, new_source in enumerate(new_sources):
+                            if new_source in source_name:
+                                for j, doc in enumerate(docArr):
+                                    if doc_type == doc:
+                                        for k, div in enumerate(divArr):
+                                            if div in division:
+                                                nsCounter[n, k, j] += 1
+
+                pageCount += 1
+
+            # Write source documents metrics
+            for n, source in enumerate(new_sources):
+                total_source_docs = np.sum(nsCounter[n, :, :])
+                writer.writerow([f'Source: {source}'])
+                writer.writerow(['The total number of new documents:', str(int(total_source_docs))])
+
+                for j, doc_type in enumerate(docArr):
+                    doc_count = np.sum(nsCounter[n, :, j])
+                    writer.writerow([f'The total number of new {doc_type.lower()} documents:', str(int(doc_count))])
+                writer.writerow([])
+
+            writer.writerow(['DIVISION BREAKDOWN'])
+            writer.writerow([])
+
+            for d, division in enumerate(divArr):
+                div_name = division.strip('/')
+                total_docs = np.sum(nsCounter[:, d, :])
+                writer.writerow([f'Division: {div_name}'])
+                writer.writerow(['The total number of new documents:', str(int(total_docs))])
+
+                for t, doc_type in enumerate(docArr):
+                    doc_count = np.sum(nsCounter[:, d, t])
+                    writer.writerow([f'The total number of new {docArr[t].lower()} documents:', str(int(doc_count))])
+                writer.writerow([])
+
+            total_overall = np.sum(nsCounter[:, :, :])
+            writer.writerow(['All Divisions'])
+            writer.writerow(['The total number of new documents:', str(int(total_overall))])
+
+            for t, doc_type in enumerate(docArr):
+                total_doc_type = np.sum(nsCounter[:, :, t])
+                writer.writerow([f'The total number of new {docArr[t].lower()} documents:', str(int(total_doc_type))])
+
+        # Move the file to its final location when completed
+        shutil.move(temp_file_path, final_file_path)
+
+        print(f"Metrics generation complete. File saved to {final_file_path}")
+        return True
+
+    except Exception as e:
+        print(f"Error in generate_metrics: {str(e)}")
+        return False
diff --git a/sde_collections/templates/admin/sde_collections/collection/change_list.html b/sde_collections/templates/admin/sde_collections/collection/change_list.html
new file mode 100644
index 00000000..0913672f
--- /dev/null
+++ b/sde_collections/templates/admin/sde_collections/collection/change_list.html
@@ -0,0 +1,15 @@
+{% extends "admin/change_list.html" %}
+{% load i18n %}
+
+{% block object-tools %}
+  {% if show_metrics_button %}
+    <div class="object-tools">
+      <a href="{{ metrics_url }}" class="button">Download Metrics</a>
+    </div>
+  {% endif %}
+
+  {% block object-tools-items %}
+    {{ block.super }}
+  {% endblock %}
+
+{% endblock %}

From 80fd262bacb319abe64eebbb3136f97a0f21cbe0 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 31 Mar 2025 15:08:18 -0500
Subject: [PATCH 2/5] Added Cleanup Mechanism

---
 sde_collections/tasks.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py
index 2ca2de06..b238abcc 100644
--- a/sde_collections/tasks.py
+++ b/sde_collections/tasks.py
@@ -237,13 +237,25 @@ def migrate_dump_to_delta_and_handle_status_transistions(collection_id):
 @celery_app.task(name="generate_metrics")
 def generate_metrics(task_id):
     """
-    To asynchronously generate metrics
+    Asynchronously generate metrics and save to a downloadable file
     """
     try:
         # Generate a file path in the media directory
         metrics_dir = os.path.join(settings.MEDIA_ROOT, 'metrics')
         os.makedirs(metrics_dir, exist_ok=True)
 
+        # Clean up old metrics files
+        for filename in os.listdir(metrics_dir):
+            if filename.startswith('metrics_') and (filename.endswith('.csv') or filename.endswith('.tmp')):
+                # Skip the current task's files
+                if not filename.startswith(f'metrics_{task_id}'):
+                    file_path = os.path.join(metrics_dir, filename)
+                    try:
+                        os.remove(file_path)
+                        print(f"Deleted old metrics file: {filename}")
+                    except Exception as e:
+                        print(f"Failed to delete {filename}: {str(e)}")
+
         # Use a temporary file during generation
         temp_file_path = os.path.join(metrics_dir, f'metrics_{task_id}.tmp')
         final_file_path = os.path.join(metrics_dir, f'metrics_{task_id}.csv')
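A note on the write path shared by patches 1 and 2: generate_metrics stages the CSV under a `.tmp` name and only moves it to `metrics_<task_id>.csv` once generation finishes, which is what lets get_metrics_file treat "final file present and >100 bytes" as the ready signal and ".tmp present" as the in-progress signal. A minimal standalone sketch of the same pattern (file names hypothetical, not part of the patch):

    import csv
    import os

    def write_csv_atomically(final_path, rows):
        # Stage the data in a sibling .tmp file so readers never see a partial CSV
        tmp_path = final_path + ".tmp"
        with open(tmp_path, "w", newline="") as f:
            csv.writer(f).writerows(rows)
        # os.replace is an atomic rename on POSIX when both paths share a filesystem
        os.replace(tmp_path, final_path)

    write_csv_atomically("metrics_demo.csv", [["Division", "Count"], ["Astrophysics", "42"]])

shutil.move, as used in the task, falls back to copy-plus-delete only when source and destination sit on different filesystems; here both live in MEDIA_ROOT/metrics/, so it behaves as a plain rename.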
From b7e666cabec9ef8ed9ef5281e2b8a58bbbd9ae87 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 31 Mar 2025 20:10:27 +0000
Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 sde_collections/admin.py        |  70 +++--
 sde_collections/tasks.py        | 258 +++++++++---------
 .../collection/change_list.html |   2 +-
 3 files changed, 177 insertions(+), 153 deletions(-)

diff --git a/sde_collections/admin.py b/sde_collections/admin.py
index 8f897eea..0f51135f 100644
--- a/sde_collections/admin.py
+++ b/sde_collections/admin.py
@@ -1,13 +1,13 @@
 import csv
+import os
+import uuid
 
 from django import forms
+from django.conf import settings
 from django.contrib import admin, messages
-from django.http import HttpResponse, FileResponse
-from django.utils.safestring import mark_safe
+from django.http import FileResponse, HttpResponse
 from django.urls import path
-import uuid
-import os
-from django.conf import settings
+from django.utils.safestring import mark_safe
 
 from sde_collections.models.delta_patterns import (
     DeltaDivisionPattern,
@@ -20,7 +20,7 @@
 from .models.collection_choice_fields import TDAMMTags
 from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
 from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
-from .tasks import fetch_full_text, import_candidate_urls_from_api, generate_metrics
+from .tasks import fetch_full_text, generate_metrics, import_candidate_urls_from_api
 
 
 def fetch_and_replace_text_for_server(modeladmin, request, queryset, server_name):
@@ -300,70 +300,82 @@ def included_curated_urls_count(self, obj) -> int:
         fetch_full_text_xli_action,
     ]
     ordering = ("cleaning_order",)
-
+
     def changelist_view(self, request, extra_context=None):
         """
         To add a button for metrics download
         """
         extra_context = extra_context or {}
-        extra_context['show_metrics_button'] = True
-        extra_context['metrics_url'] = request.path + 'metrics/'
+        extra_context["show_metrics_button"] = True
+        extra_context["metrics_url"] = request.path + "metrics/"
         return super().changelist_view(request, extra_context=extra_context)
-
+
     def get_urls(self):
         """
         To add custom endpoints for metrics functionality
        """
         urls = super().get_urls()
         custom_urls = [
-            path('metrics/', self.admin_site.admin_view(self.download_metrics), name='sde_collections_collection_metrics'),
-            path('metrics/<str:task_id>/', self.admin_site.admin_view(self.get_metrics_file), name='sde_collections_get_metrics'),
+            path(
+                "metrics/", self.admin_site.admin_view(self.download_metrics), name="sde_collections_collection_metrics"
+            ),
+            path(
+                "metrics/<str:task_id>/",
+                self.admin_site.admin_view(self.get_metrics_file),
+                name="sde_collections_get_metrics",
+            ),
         ]
         return custom_urls + urls
-
+
     def download_metrics(self, request):
         """Custom view that starts metrics generation and returns to collection list"""
         task_id = str(uuid.uuid4())
         task = generate_metrics.delay(task_id)
-
-        download_url = request.path.rsplit('metrics/', 1)[0] + f'metrics/{task_id}/'
-
+
+        download_url = request.path.rsplit("metrics/", 1)[0] + f"metrics/{task_id}/"
+
         messages.add_message(
             request,
             messages.INFO,
-            mark_safe(f"Metrics generation started. Please wait a moment and then <a href='{download_url}'>click here</a> to download when ready.")
+            mark_safe(
+                f"Metrics generation started. Please wait a moment and then <a href='{download_url}'>click here</a> to download when ready."
+            ),
         )
-        return HttpResponse(status=303, headers={"Location": request.path.replace('/metrics/', '')})
-
+        return HttpResponse(status=303, headers={"Location": request.path.replace("/metrics/", "")})
+
     def get_metrics_file(self, request, task_id):
         """Serve the generated metrics file if it exists and is valid"""
-
-        file_path = os.path.join(settings.MEDIA_ROOT, 'metrics', f'metrics_{task_id}.csv')
-
+
+        file_path = os.path.join(settings.MEDIA_ROOT, "metrics", f"metrics_{task_id}.csv")
+
         # Create the retry URL
         current_url = request.build_absolute_uri()
-
+
         # Check if file exists and is not empty (minimum size 100 bytes)
         if os.path.exists(file_path) and os.path.getsize(file_path) > 100:
-            response = FileResponse(open(file_path, 'rb'), content_type='text/csv')
-            response['Content-Disposition'] = 'attachment; filename="metrics.csv"'
+            response = FileResponse(open(file_path, "rb"), content_type="text/csv")
+            response["Content-Disposition"] = 'attachment; filename="metrics.csv"'
             return response
         else:
             # Also check if there's a temporary file indicating task is still running
-            temp_file_path = os.path.join(settings.MEDIA_ROOT, 'metrics', f'metrics_{task_id}.tmp')
+            temp_file_path = os.path.join(settings.MEDIA_ROOT, "metrics", f"metrics_{task_id}.tmp")
             if os.path.exists(temp_file_path):
                 messages.add_message(
                     request,
                     messages.INFO,
-                    mark_safe(f"The metrics file is still being generated. <a href='{current_url}'>Click here</a> to try again.")
+                    mark_safe(
+                        f"The metrics file is still being generated. <a href='{current_url}'>Click here</a> to try again."
+                    ),
                 )
             else:
                 messages.add_message(
                     request,
                     messages.WARNING,
-                    mark_safe(f"The metrics file is not ready yet. <a href='{current_url}'>Click here</a> to try again.")
+                    mark_safe(
+                        f"The metrics file is not ready yet. <a href='{current_url}'>Click here</a> to try again."
+                    ),
                 )
-            return HttpResponse(status=303, headers={"Location": request.path.replace(f'/metrics/{task_id}/', '')})
+            return HttpResponse(status=303, headers={"Location": request.path.replace(f"/metrics/{task_id}/", "")})
 
 
 @admin.action(description="Exclude URL and all children")
diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py
index b238abcc..859bdade 100644
--- a/sde_collections/tasks.py
+++ b/sde_collections/tasks.py
@@ -1,12 +1,12 @@
 # /sde_collections/tasks.py
+import csv
 import json
 import os
 import shutil
-import csv
-import requests
-import numpy as np
 
 import boto3
+import numpy as np
+import requests
 from django.apps import apps
 from django.conf import settings
 from django.core import management
@@ -241,73 +241,79 @@ def generate_metrics(task_id):
     """
     try:
         # Generate a file path in the media directory
-        metrics_dir = os.path.join(settings.MEDIA_ROOT, 'metrics')
+        metrics_dir = os.path.join(settings.MEDIA_ROOT, "metrics")
         os.makedirs(metrics_dir, exist_ok=True)
-
+
         # Clean up old metrics files
         for filename in os.listdir(metrics_dir):
-            if filename.startswith('metrics_') and (filename.endswith('.csv') or filename.endswith('.tmp')):
+            if filename.startswith("metrics_") and (filename.endswith(".csv") or filename.endswith(".tmp")):
                 # Skip the current task's files
-                if not filename.startswith(f'metrics_{task_id}'):
+                if not filename.startswith(f"metrics_{task_id}"):
                     file_path = os.path.join(metrics_dir, filename)
                     try:
                         os.remove(file_path)
                         print(f"Deleted old metrics file: {filename}")
                     except Exception as e:
                         print(f"Failed to delete {filename}: {str(e)}")
-
+
         # Use a temporary file during generation
-        temp_file_path = os.path.join(metrics_dir, f'metrics_{task_id}.tmp')
-        final_file_path = os.path.join(metrics_dir, f'metrics_{task_id}.csv')
-
+        temp_file_path = os.path.join(metrics_dir, f"metrics_{task_id}.tmp")
+        final_file_path = os.path.join(metrics_dir, f"metrics_{task_id}.csv")
+
         # Initialize common variables
-        divArr = ['/Astrophysics/','/Biological and Physical Sciences/','/Earth Science/','/Heliophysics/','/Planetary Science/']
-        docArr = ['Data','Images','Documentation','Software and Tools','Missions and Instruments']
-
+        divArr = [
+            "/Astrophysics/",
+            "/Biological and Physical Sciences/",
+            "/Earth Science/",
+            "/Heliophysics/",
+            "/Planetary Science/",
+        ]
+        docArr = ["Data", "Images", "Documentation", "Software and Tools", "Missions and Instruments"]
+
         url = "https://sciencediscoveryengine.nasa.gov/api/v1/search.query"
-
+
         # Create a temporary buffer for CSV data
-        with open(temp_file_path, 'w', newline='') as csvfile:
+        with open(temp_file_path, "w", newline="") as csvfile:
             writer = csv.writer(csvfile)
-
+
             # SECTION 1: DOCUMENT COUNT METRICS
-            writer.writerow(['SECTION 1: DOCUMENT COUNT BY DIVISION AND DOCUMENT TYPE'])
+            writer.writerow(["SECTION 1: DOCUMENT COUNT BY DIVISION AND DOCUMENT TYPE"])
             writer.writerow([])
-
+
             # Initialize document count array
-            recCounter = np.zeros((6,5))
-
+            recCounter = np.zeros((6, 5))
+
             # Process documents through API
             pageCount = 1
             pageSize = 1000
-
+
             print("Fetching document count metrics...")
-
+
             while True:
                 payload = {
-                    "app": 'nasa-sba-smd',
+                    "app": "nasa-sba-smd",
                     "query": {
-                        "name": 'query-smd-primary',
+                        "name": "query-smd-primary",
                         "page": pageCount,
                         "pageSize": pageSize,
-                        "scope": 'All',
+                        "scope": "All",
                         "advanced": {},
                     },
                 }
-
+
                 print(f"Processing page {pageCount} for document count")
                 response_data = requests.post(url, headers={}, json=payload, verify=False).json()
-
-                num_records = len(response_data.get('records', []))
+
+                num_records = len(response_data.get("records", []))
                 if num_records == 0:
                     break
-
-                for record in response_data['records']:
-                    if 'sourcestr56' in record:
+
+                for record in response_data["records"]:
+                    if "sourcestr56" in record:
                         # Extract division and document type
-                        division = record.get('treepath', [''])[0]
-                        doctype = record['sourcestr56']
-
+                        division = record.get("treepath", [""])[0]
+                        doctype = record["sourcestr56"]
+
                         # Update counters
                         for j in range(0, len(docArr)):
                             for k in range(0, len(divArr)):
@@ -315,96 +321,102 @@ def generate_metrics(task_id):
                                 if divArr[k] in division:
                                     recCounter[k, j] += 1
                                     recCounter[5, j] += 1
-
+
                 pageCount += 1
-
+
             # Write document count metrics
-            writer.writerow(['Division', 'Document Type', 'Count'])
-
+            writer.writerow(["Division", "Document Type", "Count"])
+
             # Add All Divisions data
-            writer.writerow(['All Divisions', 'Data', str(int(recCounter[5, 0]))])
-            writer.writerow(['All Divisions', 'Images', str(int(recCounter[5, 1]))])
-            writer.writerow(['All Divisions', 'Documentation', str(int(recCounter[5, 2]))])
-            writer.writerow(['All Divisions', 'Software and Tools', str(int(recCounter[5, 3]))])
-            writer.writerow(['All Divisions', 'Missions and Instruments', str(int(recCounter[5, 4]))])
-            writer.writerow(['All Divisions', 'Total', str(int(np.sum(recCounter[5, :])))])
-
+            writer.writerow(["All Divisions", "Data", str(int(recCounter[5, 0]))])
+            writer.writerow(["All Divisions", "Images", str(int(recCounter[5, 1]))])
+            writer.writerow(["All Divisions", "Documentation", str(int(recCounter[5, 2]))])
+            writer.writerow(["All Divisions", "Software and Tools", str(int(recCounter[5, 3]))])
+            writer.writerow(["All Divisions", "Missions and Instruments", str(int(recCounter[5, 4]))])
+            writer.writerow(["All Divisions", "Total", str(int(np.sum(recCounter[5, :])))])
+
             # Add data for each division
-            division_names = ['Astrophysics', 'Biological and Physical Sciences', 'Earth Science', 'Heliophysics', 'Planetary Science']
+            division_names = [
+                "Astrophysics",
+                "Biological and Physical Sciences",
+                "Earth Science",
+                "Heliophysics",
+                "Planetary Science",
+            ]
             for i, div_name in enumerate(division_names):
-                writer.writerow([div_name, 'Data', str(int(recCounter[i, 0]))])
-                writer.writerow([div_name, 'Images', str(int(recCounter[i, 1]))])
-                writer.writerow([div_name, 'Documentation', str(int(recCounter[i, 2]))])
-                writer.writerow([div_name, 'Software and Tools', str(int(recCounter[i, 3]))])
-                writer.writerow([div_name, 'Missions and Instruments', str(int(recCounter[i, 4]))])
-                writer.writerow([div_name, 'Total', str(int(np.sum(recCounter[i, :])))])
-
+                writer.writerow([div_name, "Data", str(int(recCounter[i, 0]))])
+                writer.writerow([div_name, "Images", str(int(recCounter[i, 1]))])
+                writer.writerow([div_name, "Documentation", str(int(recCounter[i, 2]))])
+                writer.writerow([div_name, "Software and Tools", str(int(recCounter[i, 3]))])
+                writer.writerow([div_name, "Missions and Instruments", str(int(recCounter[i, 4]))])
+                writer.writerow([div_name, "Total", str(int(np.sum(recCounter[i, :])))])
+
             # SECTION 2: SOURCES COUNT
             writer.writerow([])
-            writer.writerow(['SECTION 2: SOURCES COUNT BY DIVISION'])
+            writer.writerow(["SECTION 2: SOURCES COUNT BY DIVISION"])
             writer.writerow([])
-
+
             # Initialize sources and divisions lists
             sources = []
             divs = []
-
+
             # Reset page counter for new query
             pageCount = 1
-
+
             print("Fetching sources count metrics...")
-
+
             while True:
                 payload = {
-                    "app": 'nasa-sba-smd',
+                    "app": "nasa-sba-smd",
                     "query": {
-                        "name": 'query-smd-primary',
+                        "name": "query-smd-primary",
                         "page": pageCount,
                         "pageSize": pageSize,
-                        "scope": 'All',
+                        "scope": "All",
                         "advanced": {},
                     },
                 }
-
+
                 print(f"Processing page {pageCount} for sources count")
                 response_data = requests.post(url, headers={}, json=payload, verify=False).json()
-
-                num_records = len(response_data.get('records', []))
+
+                num_records = len(response_data.get("records", []))
                 if num_records == 0:
                     break
-
-                for record in response_data['records']:
-                    if 'sourcestr56' in record:
-                        source_name = record.get('collection', [''])[0]
-                        division = record.get('treepath', [''])[0]
-
+
+                for record in response_data["records"]:
+                    if "sourcestr56" in record:
+                        source_name = record.get("collection", [""])[0]
+                        division = record.get("treepath", [""])[0]
+
                         if source_name not in sources:
                             sources.append(source_name)
                             divs.append(division)
-
+
                 pageCount += 1
-
+
             # Count sources by division
             division_counts = {div: 0 for div in divArr}
             total_count = len(sources)
-
+
             for i, div_path in enumerate(divs):
                 for div in divArr:
                     if div in div_path:
                         division_counts[div] += 1
-
+
             # Write sources count metrics
-            writer.writerow(['Division', 'Count'])
-
+            writer.writerow(["Division", "Count"])
+
             for div in divArr:
-                writer.writerow([div.strip('/'), str(division_counts[div])])
-
-            writer.writerow(['Total', str(total_count)])
-
+                writer.writerow([div.strip("/"), str(division_counts[div])])
+
+            writer.writerow(["Total", str(total_count)])
+
             # SECTION 3: DOCUMENTS BY SOURCE
             writer.writerow([])
-            writer.writerow(['SECTION 3: DOCUMENTS BY SOURCE'])
+            writer.writerow(["SECTION 3: DOCUMENTS BY SOURCE"])
             writer.writerow([])
-
+
             # List of new sources to track
             new_sources = [
                 "AIM: Aeronomy of Ice in the Mesosphere",
@@ -421,42 +433,42 @@ def generate_metrics(task_id):
                 "LDOPE: Land Data Operational Products Evaluation",
                 "CPL: Cloud Physics Lidar",
                 "Direct Readout Laboratory",
-                "Center for Near Earth Object Studies (CNEOS)"
+                "Center for Near Earth Object Studies (CNEOS)",
             ]
-
+
             # Initialize counter array for source documents
             nsCounter = np.zeros((len(new_sources), 5, 5))
-
+
             # Reset page counter for new query
             pageCount = 1
-
+
             print("Fetching source documents metrics...")
-
+
             while True:
                 payload = {
-                    "app": 'nasa-sba-smd',
+                    "app": "nasa-sba-smd",
                     "query": {
-                        "name": 'query-smd-primary',
+                        "name": "query-smd-primary",
                         "page": pageCount,
                         "pageSize": pageSize,
-                        "scope": 'All',
+                        "scope": "All",
                         "advanced": {},
                     },
                 }
-
+
                 print(f"Processing page {pageCount} for source documents")
                 response_data = requests.post(url, headers={}, json=payload, verify=False).json()
-
-                num_records = len(response_data.get('records', []))
+
+                num_records = len(response_data.get("records", []))
                 if num_records == 0:
                     break
-
-                for record in response_data['records']:
-                    if 'sourcestr56' in record:
-                        source_name = record.get('treepath', [''])[0]
-                        division = record.get('treepath', [''])[0]
-                        doc_type = record.get('sourcestr56', '')
-
+
+                for record in response_data["records"]:
+                    if "sourcestr56" in record:
+                        source_name = record.get("treepath", [""])[0]
+                        division = record.get("treepath", [""])[0]
+                        doc_type = record.get("sourcestr56", "")
+
                         for n, new_source in enumerate(new_sources):
                             if new_source in source_name:
                                 for j, doc in enumerate(docArr):
@@ -464,48 +476,48 @@ def generate_metrics(task_id):
                                     for k, div in enumerate(divArr):
                                         if div in division:
                                             nsCounter[n, k, j] += 1
-
+
                 pageCount += 1
-
+
             # Write source documents metrics
             for n, source in enumerate(new_sources):
                 total_source_docs = np.sum(nsCounter[n, :, :])
-                writer.writerow([f'Source: {source}'])
-                writer.writerow(['The total number of new documents:', str(int(total_source_docs))])
-
+                writer.writerow([f"Source: {source}"])
+                writer.writerow(["The total number of new documents:", str(int(total_source_docs))])
+
                 for j, doc_type in enumerate(docArr):
                     doc_count = np.sum(nsCounter[n, :, j])
-                    writer.writerow([f'The total number of new {doc_type.lower()} documents:', str(int(doc_count))])
+                    writer.writerow([f"The total number of new {doc_type.lower()} documents:", str(int(doc_count))])
                 writer.writerow([])
-
-            writer.writerow(['DIVISION BREAKDOWN'])
+
+            writer.writerow(["DIVISION BREAKDOWN"])
             writer.writerow([])
-
+
             for d, division in enumerate(divArr):
-                div_name = division.strip('/')
+                div_name = division.strip("/")
                 total_docs = np.sum(nsCounter[:, d, :])
-                writer.writerow([f'Division: {div_name}'])
-                writer.writerow(['The total number of new documents:', str(int(total_docs))])
-
+                writer.writerow([f"Division: {div_name}"])
+                writer.writerow(["The total number of new documents:", str(int(total_docs))])
+
                 for t, doc_type in enumerate(docArr):
                     doc_count = np.sum(nsCounter[:, d, t])
-                    writer.writerow([f'The total number of new {docArr[t].lower()} documents:', str(int(doc_count))])
+                    writer.writerow([f"The total number of new {docArr[t].lower()} documents:", str(int(doc_count))])
                 writer.writerow([])
-
+
             total_overall = np.sum(nsCounter[:, :, :])
-            writer.writerow(['All Divisions'])
-            writer.writerow(['The total number of new documents:', str(int(total_overall))])
-
+            writer.writerow(["All Divisions"])
+            writer.writerow(["The total number of new documents:", str(int(total_overall))])
+
             for t, doc_type in enumerate(docArr):
                 total_doc_type = np.sum(nsCounter[:, :, t])
-                writer.writerow([f'The total number of new {docArr[t].lower()} documents:', str(int(total_doc_type))])
-
+                writer.writerow([f"The total number of new {docArr[t].lower()} documents:", str(int(total_doc_type))])
+
         # Move the file to its final location when completed
         shutil.move(temp_file_path, final_file_path)
-
+
         print(f"Metrics generation complete. File saved to {final_file_path}")
         return True
-
+
     except Exception as e:
         print(f"Error in generate_metrics: {str(e)}")
         return False
diff --git a/sde_collections/templates/admin/sde_collections/collection/change_list.html b/sde_collections/templates/admin/sde_collections/collection/change_list.html
index 0913672f..616c0165 100644
--- a/sde_collections/templates/admin/sde_collections/collection/change_list.html
+++ b/sde_collections/templates/admin/sde_collections/collection/change_list.html
@@ -12,4 +12,4 @@
     {{ block.super }}
   {% endblock %}
 
-{% endblock %}
\ No newline at end of file
+{% endblock %}
+ writer.writerow([f"The total number of new {docArr[t].lower()} documents:", str(int(total_doc_type))]) + # Move the file to its final location when completed shutil.move(temp_file_path, final_file_path) - + print(f"Metrics generation complete. File saved to {final_file_path}") return True - + except Exception as e: print(f"Error in generate_metrics: {str(e)}") return False diff --git a/sde_collections/templates/admin/sde_collections/collection/change_list.html b/sde_collections/templates/admin/sde_collections/collection/change_list.html index 0913672f..616c0165 100644 --- a/sde_collections/templates/admin/sde_collections/collection/change_list.html +++ b/sde_collections/templates/admin/sde_collections/collection/change_list.html @@ -12,4 +12,4 @@ {{ block.super }} {% endblock %} -{% endblock %} +{% endblock %} From 922e90eda72418de2dc97b487aae2a318d681e6d Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 31 Mar 2025 15:58:46 -0500 Subject: [PATCH 4/5] Added chagelog + other minor fixes --- CHANGELOG.md | 9 +++++++++ requirements/base.txt | 1 + sde_collections/admin.py | 17 +++++++++++++---- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ab909b0..a12582a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -183,3 +183,12 @@ For each PR made, an entry should be added to this changelog. It should contain - physics_of_the_cosmos - stsci_space_telescope_science_institute - Once the front end has been updated to allow for tag edits, all astrophysics collections will be marked to be run through the pipeline + +- 1295-asynchronous-metrics-download-in-admin-panel + - Description: Implemented asynchronous metrics download in Django admin + - Changes: + - Button Addition:Integrated a 'metrics' button + - Task Generation: download_metrics handles button clicks to initiate a Celery task, display a download link and manage redirection. + - Cleanup Mechanism: Cleans up old metrics files in the directory, keeping only current task related files. + - Background Processing: Runs generate_metrics task asynchronously to gather data and generate a CSV in MEDIA_ROOT/metrics/ + - File Retrieval: get_metrics_file checks file availability and size, providing a download if ready or status messages for in progress files. diff --git a/requirements/base.txt b/requirements/base.txt index a4ca9cf8..0b81d058 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -34,3 +34,4 @@ tenacity==8.2.2 tqdm==4.66.3 unidecode==1.3.8 xmltodict==0.13.0 +numpy==1.24.3 diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 8f897eea..dfb00fd5 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -324,14 +324,17 @@ def get_urls(self): def download_metrics(self, request): """Custom view that starts metrics generation and returns to collection list""" task_id = str(uuid.uuid4()) - task = generate_metrics.delay(task_id) + generate_metrics.delay(task_id) download_url = request.path.rsplit('metrics/', 1)[0] + f'metrics/{task_id}/' messages.add_message( request, messages.INFO, - mark_safe(f"Metrics generation started. Please wait a moment and then click here to download when ready.") + mark_safe( + f"Metrics generation started. Please wait a moment and then " + f"click here to download when ready." 
+ ) ) return HttpResponse(status=303, headers={"Location": request.path.replace('/metrics/', '')}) @@ -355,13 +358,19 @@ def get_metrics_file(self, request, task_id): messages.add_message( request, messages.INFO, - mark_safe(f"The metrics file is still being generated. Click here to try again.") + mark_safe( + f"The metrics file is still being generated. " + f"Click here to try again." + ) ) else: messages.add_message( request, messages.WARNING, - mark_safe(f"The metrics file is not ready yet. Click here to try again.") + mark_safe( + f"The metrics file is not ready yet. " + f"Click here to try again." + ) ) return HttpResponse(status=303, headers={"Location": request.path.replace(f'/metrics/{task_id}/', '')}) From 45578877041b08810b2930af530961161f8d8e70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 31 Mar 2025 21:01:37 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/admin.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index cadf9f2c..5a45bc60 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -331,16 +331,16 @@ def download_metrics(self, request): """Custom view that starts metrics generation and returns to collection list""" task_id = str(uuid.uuid4()) generate_metrics.delay(task_id) - - download_url = request.path.rsplit('metrics/', 1)[0] + f'metrics/{task_id}/' - + + download_url = request.path.rsplit("metrics/", 1)[0] + f"metrics/{task_id}/" + messages.add_message( request, messages.INFO, mark_safe( f"Metrics generation started. Please wait a moment and then " f"click here to download when ready." - ) + ), ) return HttpResponse(status=303, headers={"Location": request.path.replace("/metrics/", "")}) @@ -367,16 +367,15 @@ def get_metrics_file(self, request, task_id): mark_safe( f"The metrics file is still being generated. " f"Click here to try again." - ) + ), ) else: messages.add_message( request, messages.WARNING, mark_safe( - f"The metrics file is not ready yet. " - f"Click here to try again." - ) + f"The metrics file is not ready yet. " f"Click here to try again." + ), ) return HttpResponse(status=303, headers={"Location": request.path.replace(f"/metrics/{task_id}/", "")})
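For exercising the whole pipeline without clicking through the admin, the task can be run eagerly from a Django shell; a sketch assuming a configured MEDIA_ROOT and Celery's standard apply() eager-execution API:

    import os
    import uuid

    from django.conf import settings
    from sde_collections.tasks import generate_metrics

    task_id = str(uuid.uuid4())
    generate_metrics.apply(args=[task_id])  # runs synchronously in-process; no worker required
    print(os.path.join(settings.MEDIA_ROOT, "metrics", f"metrics_{task_id}.csv"))

If the file at the printed path exists and exceeds the 100-byte threshold, get_metrics_file will serve it as an attachment; otherwise it redirects back to the changelist with an INFO or WARNING message depending on whether the .tmp file is still present.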