Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
45c1d77
feat: added celery export occurrence task
mohamedelabbas1996 Feb 17, 2025
f6871ea
feat: added export & export_status endpoints
mohamedelabbas1996 Feb 17, 2025
b3e448d
added migration files
mohamedelabbas1996 Feb 17, 2025
bb745f6
fixed migration conflict
mohamedelabbas1996 Feb 17, 2025
518b8df
fix: disabled pagination for export action
mohamedelabbas1996 Feb 18, 2025
b3b4369
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Feb 18, 2025
8d98759
fix: merged migrations
mohamedelabbas1996 Feb 18, 2025
21470b9
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Feb 23, 2025
a8673af
feat: added DataExport Job Type
mohamedelabbas1996 Feb 24, 2025
523d177
Implemented JSON export for occurrence data
mohamedelabbas1996 Mar 4, 2025
ac7cfbc
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Mar 4, 2025
04ab2cf
feat: Added support for csv file format
mohamedelabbas1996 Mar 4, 2025
e4599b9
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Mar 6, 2025
94cc7a3
chore: Moved export actions to a separate view under the exports app
mohamedelabbas1996 Mar 6, 2025
ed3960a
Merge branch 'main' of github.com:RolnickLab/antenna into feat/export…
mihow Mar 7, 2025
c4c9820
chore: ignore unresolvable type errors
mihow Mar 7, 2025
5dbc002
chore: remove dependencies for darwincore export in this PR
mihow Mar 7, 2025
a86a348
fix: use mixin for get_active_project
mihow Mar 7, 2025
57c5905
feat: register export views in api router
mihow Mar 7, 2025
e0df304
feat: Implemented Data Export Framework & Occurrence Exports
mohamedelabbas1996 Mar 10, 2025
8be00cd
Merge branch 'feat/export-occurrences-data' of https://github.com/Rol…
mohamedelabbas1996 Mar 10, 2025
b297a84
feat: Added more fields to the OccurrenceTabularSerializer
mohamedelabbas1996 Mar 11, 2025
d8d3b5d
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Mar 11, 2025
1270fd1
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Mar 17, 2025
44c3ca8
Refactor DataExport Model and API & Admin Integration
mohamedelabbas1996 Mar 17, 2025
95e6e86
Merge branch 'feat/export-occurrences-data' of https://github.com/Rol…
mohamedelabbas1996 Mar 17, 2025
8a02b3d
Removed DataExport status field
mohamedelabbas1996 Mar 17, 2025
c8f5d3e
chore: Raise NotImplemented for abstract methods
mohamedelabbas1996 Mar 17, 2025
349925a
Brought back DataExport file_url field
mohamedelabbas1996 Mar 17, 2025
e0321bd
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Mar 19, 2025
45485b2
Refactor Data Export: Improve Filtering, Naming, and JSON Validity
mohamedelabbas1996 Mar 20, 2025
4105177
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Mar 20, 2025
3c9aca2
Merge branch 'feat/export-occurrences-data' of https://github.com/Rol…
mohamedelabbas1996 Mar 20, 2025
543a142
fix: Added missing migration file
mohamedelabbas1996 Mar 20, 2025
95745d7
fix: Added missing migration file
mohamedelabbas1996 Mar 20, 2025
25896b7
Merge branch 'main' into feat/export-occurrences-data
annavik Mar 21, 2025
f14653f
fix: tweak labels to be sentence case
annavik Mar 21, 2025
43e8835
fix: update CSV export field from verification -> verification_status
annavik Mar 21, 2025
f836bfa
Improve DataExport handling, filtering, and cleanup logic
mohamedelabbas1996 Mar 24, 2025
3d0514a
Merge branch 'feat/export-occurrences-data' of https://github.com/Rol…
mohamedelabbas1996 Mar 24, 2025
1e879e4
test: multiple methods of nesting related obj data for exports
mihow Mar 25, 2025
4d48622
feat: return absolute urls for export files
mihow Mar 25, 2025
747708c
Merge branch 'main' into feat/export-occurrences-data
mihow Mar 25, 2025
2478789
Merge branch 'main' into feat/export-occurrences-data
mohamedelabbas1996 Mar 28, 2025
cd2f57c
Refactor Export Logic and Add Export Stats
mohamedelabbas1996 Mar 28, 2025
4bae6c7
Merge branch 'feat/export-occurrences-data' of https://github.com/Rol…
mohamedelabbas1996 Mar 28, 2025
26181d0
Enhance Export Details
mohamedelabbas1996 Mar 31, 2025
eded961
fix: make summary count consistent with exports
mihow Apr 1, 2025
02dd4b7
feat: update and return total record count before starting export
mihow Apr 1, 2025
058f93e
feat: update total record count before exporting first batch
mihow Apr 2, 2025
b20a851
feat: lower batch size for exports to increase update frequency
mihow Apr 2, 2025
a518a74
chore: reset all migrations to main
mihow Apr 3, 2025
0b06579
chore: recreate migrations
mihow Apr 3, 2025
ee34d2c
chore: moved export format validation logic to the serializer
mohamedelabbas1996 Apr 4, 2025
0900bb0
chore: changed collection filter param name to collection_id
mohamedelabbas1996 Apr 4, 2025
a1eb605
Merge branch 'feat/export-occurrences-data' of https://github.com/Rol…
mohamedelabbas1996 Apr 4, 2025
faeb081
Merge branch 'main' of github.com:RolnickLab/antenna into feat/export…
mihow Apr 8, 2025
6a50eed
chore: fix type hints
mihow Apr 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions ami/main/api/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import logging

from celery import shared_task
from django.core.files.storage import default_storage
from django.core.mail import send_mail

from ami.main.models import ExportHistory, Occurrence
from ami.utils.exports import create_dwc_archive
from config.settings.local import DEFAULT_FROM_EMAIL

logger = logging.getLogger(__name__)


@shared_task(bind=True)
def export_occurrences_task(self, occurrence_ids=None, user_email=None, base_url=None):
    """
    Celery task for exporting occurrences asynchronously to the default storage.

    Builds an archive with ``create_dwc_archive`` for the selected occurrences,
    uploads it under ``exports/`` in ``default_storage``, records the outcome on
    the ``ExportHistory`` row that shares this task's id, and emails the
    download link to the requesting user.

    Args:
        occurrence_ids: Primary keys of the occurrences to export. ``None`` is
            treated as an empty selection.
        user_email: Address to notify once the export is available. When it is
            empty, no email is sent.
        base_url: Scheme and host used to turn a relative storage URL into an
            absolute one. Ignored when the storage already returns an absolute
            URL.

    Returns:
        dict: ``{"status": "completed", "file_url": <url>}`` on success.

    Raises:
        celery.exceptions.Retry: When the export fails and a retry is scheduled
            (up to 3 retries, 60 seconds apart). After the last retry the
            original exception propagates.
    """
    # Resolve the sender from the active Django settings at call time instead of
    # relying on a value imported from one specific settings module.
    from django.conf import settings

    task_id = self.request.id
    history = ExportHistory.objects.filter(task_id=task_id)

    try:
        # ``id__in=None`` is rejected by the ORM; a missing list means "nothing selected".
        occurrences = Occurrence.objects.filter(id__in=occurrence_ids or [])
        file_path = create_dwc_archive(occurrences)
        # NOTE(review): the local archive at ``file_path`` is not removed here;
        # confirm whether create_dwc_archive or the caller owns its cleanup.

        # Upload under 'exports/' using the task id as a unique file name.
        with open(file_path, "rb") as f:
            # save() returns the name actually used, which can differ from the
            # requested one (e.g. on a retry when the object already exists).
            stored_path = default_storage.save(f"exports/{task_id}.zip", f)

        storage_url = default_storage.url(stored_path)
        if storage_url.startswith(("http://", "https://")) or not base_url:
            # Already absolute, or there is no host to prepend.
            file_url = storage_url
        else:
            file_url = f"{base_url}{storage_url}"

        logger.info(f"Export completed: {file_url}")
        # Update export history
        history.update(status="completed", file_url=file_url)
    except Exception as e:
        logger.exception(f"Export failed: {e}")
        # The row is flipped back to "completed" if a later retry succeeds.
        history.update(status="failed")
        # Retry up to 3 times; raising makes the control flow explicit.
        raise self.retry(exc=e, countdown=60, max_retries=3)

    # The notification is deliberately outside the export try-block: the archive
    # is already stored and recorded, so a mail failure must not mark the export
    # as failed or trigger a full re-export.
    if user_email:
        try:
            send_mail(
                subject="Your Occurrence Export is Ready!",
                message=f"""Hello,\n\nYour occurrence data export is complete!
You can download the file here:\n{file_url}\n\nThank you!""",
                from_email=settings.DEFAULT_FROM_EMAIL,
                recipient_list=[user_email],
                fail_silently=False,
            )
            logger.info(f"Email sent to {user_email} with download link.")
        except Exception:
            logger.exception(f"Could not send export notification to {user_email}")
    else:
        logger.info("No user email provided; skipping export notification.")

    return {"status": "completed", "file_url": file_url}
88 changes: 87 additions & 1 deletion ami/main/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import logging
from statistics import mode

from celery.result import AsyncResult
from django.contrib.postgres.search import TrigramSimilarity
from django.core import exceptions
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.db import models
from django.db.models import Prefetch
from django.db.models.query import QuerySet
Expand All @@ -26,6 +29,8 @@
from ami.base.pagination import LimitOffsetPaginationWithPermissions
from ami.base.permissions import IsActiveStaffOrReadOnly
from ami.base.serializers import FilterParamsSerializer, SingleParamSerializer
from ami.main.api.tasks import export_occurrences_task
from ami.utils.exports import create_dwc_archive
from ami.utils.requests import get_active_classification_threshold, get_active_project, project_id_doc_param
from ami.utils.storages import ConnectionTestResult

Expand All @@ -35,6 +40,7 @@
Detection,
Device,
Event,
ExportHistory,
Identification,
Occurrence,
Page,
Expand Down Expand Up @@ -998,6 +1004,7 @@ def get_serializer_class(self):
return OccurrenceSerializer

def get_queryset(self) -> QuerySet:
logger.info(f"OccurrenceViewset action : {self.action}")
project = get_active_project(self.request)
qs = super().get_queryset()
if project:
Expand All @@ -1010,7 +1017,7 @@ def get_queryset(self) -> QuerySet:
qs = qs.with_detections_count().with_timestamps() # type: ignore
qs = qs.with_identifications() # type: ignore

if self.action == "list":
if self.action == "list" or self.action == "export":
qs = (
qs.all()
.exclude(detections=None)
Expand All @@ -1033,6 +1040,85 @@ def get_queryset(self) -> QuerySet:
def list(self, request, *args, **kwargs):
return super().list(request, *args, **kwargs)

def paginate_queryset(self, queryset):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may be more scalable to keep pagination but automatically loop through all the pages, rather than triggering a single huge database query. Or is there another way to break it apart? I can give you a large DB snapshot to test on.

Copy link
Contributor Author

@mohamedelabbas1996 mohamedelabbas1996 Feb 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would appreciate it very much if I could have access to the DB snapshot.

"""
Override pagination to skip pagination for 'export' action.
"""

if self.action == "export":
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm tempted to create a new set of views in the exports app (exports/views.py). Many of the exports will contain multiple models (occurrences & events). It may be harder to reuse filters from the Occurrences view, but they could be moved to QuerySet methods. Also the custom export views can be simpler, since they will all be read-only list views (no CRUD needed). Thoughts?

Copy link
Contributor Author

@mohamedelabbas1996 mohamedelabbas1996 Mar 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From a code modularity and separation of concerns perspective, I agree that having separate views for exports makes the code cleaner and less cluttered. That approach also simplifies maintenance since export-related logic won't be mixed into the main Occurrence views.

As for filters, I wanted to bring up a related issue: how do we efficiently pass a filtered queryset to a job object? Can we consider adding a queryset parameter directly to the job object, or should we just use stage parameters?
Would love to hear your thoughts on this.

return None # Disable pagination, return full queryset

return super().paginate_queryset(queryset) # Apply normal pagination

@action(detail=False, methods=["post"])
def export(self, request):
    """
    Trigger occurrence export via Celery, passing only filtered occurrence IDs.

    The request's filters are applied to the viewset queryset, the matching
    occurrence ids are collected, and ``export_occurrences_task`` is queued
    with them. An ``ExportHistory`` row is written before the task is
    dispatched so the worker always finds a row to update.

    Returns:
        Response: ``{"task_id": <id>}`` identifying the queued export.
    """
    import uuid

    # get_queryset() alone ignores the filter backends; filter_queryset()
    # applies the request's filters so the export matches what was asked for.
    query_set = self.filter_queryset(self.get_queryset())
    occurrence_ids = list(query_set.values_list("id", flat=True))  # Extract IDs only

    logger.info(f"OccurrenceViewSet.export - Exporting {len(occurrence_ids)} occurrences")
    base_url = request.build_absolute_uri("/").rstrip("/")  # Get the full domain name

    # Choose the task id up front and save the history row first. Creating the
    # row after dispatch lets a fast worker finish (and update zero rows)
    # before the row exists.
    task_id = str(uuid.uuid4())
    ExportHistory.objects.create(user=request.user, task_id=task_id, status="pending")

    try:
        export_occurrences_task.apply_async(
            kwargs={"occurrence_ids": occurrence_ids, "user_email": request.user.email, "base_url": base_url},
            task_id=task_id,
        )
    except Exception:
        # The task was never queued; do not leave the row stuck in "pending".
        ExportHistory.objects.filter(task_id=task_id).update(status="failed")
        raise

    return Response({"task_id": task_id})

@action(detail=False, methods=["get"])
def export_status(self, request):
    """
    Check export task status.

    Query params:
        task_id: Id returned by the ``export`` action (required).

    Returns:
        Response:
            - 400 when ``task_id`` is missing.
            - 404 when no ``ExportHistory`` row exists for ``task_id``.
            - otherwise ``{"status": ...}``, plus ``file_url`` for completed
              exports or ``error`` for failed ones.
    """
    task_id = request.query_params.get("task_id")
    if not task_id:
        return Response({"error": "task_id is required"}, status=status.HTTP_400_BAD_REQUEST)

    # Celery cannot tell an unknown id from a queued one: both report the state
    # "PENDING" with a ``None`` result. The history row written by ``export``
    # is therefore used to decide whether the id is known.
    history = ExportHistory.objects.filter(task_id=task_id).first()
    if history is None:
        return Response({"error": "Invalid or unknown task ID"}, status=status.HTTP_404_NOT_FOUND)

    task = AsyncResult(task_id)
    state = task.state

    if state == "PENDING":
        # "PENDING" is also reported when the result backend no longer holds
        # the result, so prefer what was persisted on the history row.
        if history.status == "completed":
            return Response({"status": "completed", "file_url": history.file_url})
        return Response({"status": history.status})

    if state == "SUCCESS":
        result = task.result
        # Fall back to the stored URL if the task result is not the expected dict.
        file_url = result.get("file_url") if isinstance(result, dict) else history.file_url
        return Response({"status": "completed", "file_url": file_url})

    if state == "FAILURE":
        return Response({"status": "failed", "error": str(task.result)})

    # Any other Celery state (e.g. STARTED, RETRY) is passed through unchanged.
    return Response({"status": state})

@action(detail=False, methods=["post"])
def export_test(self, request):
    """
    Synchronous test endpoint to generate a DwC-A archive instantly.

    Builds the archive in the request cycle with ``create_dwc_archive``,
    uploads it to ``default_storage`` under ``exports/`` and returns its URL.

    Returns:
        Response:
            - 400 when the filtered queryset is empty.
            - otherwise ``{"message": ..., "file_url": <url>}``.
    """
    import datetime

    # Apply the request's filters so the test export matches the requested subset.
    query_set = self.filter_queryset(self.get_queryset())

    if not query_set.exists():
        return Response({"error": "No occurrences found to export."}, status=status.HTTP_400_BAD_REQUEST)

    archive_path = create_dwc_archive(query_set)
    logger.info(f"Test export created: {archive_path}")
    # NOTE(review): the local archive at ``archive_path`` is not removed here;
    # confirm whether create_dwc_archive or the caller owns its cleanup.

    # Compact UTC timestamp: str(datetime.now()) contains spaces and colons,
    # which make awkward object keys and URLs.
    timestamp = datetime.datetime.now(tz=datetime.timezone.utc).strftime("%Y%m%dT%H%M%S%fZ")
    file_name = f"exports/dwca_{timestamp}.zip"

    # Pass the open file handle straight to the storage rather than reading
    # the whole archive into memory first.
    with open(archive_path, "rb") as archive_file:
        # save() returns the name actually used, which may differ from file_name.
        stored_name = default_storage.save(file_name, archive_file)

    file_url = default_storage.url(stored_name)

    return Response({"message": "Export completed successfully", "file_url": file_url})


class TaxonViewSet(DefaultViewSet):
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Generated by Django 4.2.10 on 2025-02-17 01:16

from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Schema migration for the "main" app: alters Classification.algorithm and
    # creates the ExportHistory table.

    # Must run after the configured user model exists (ExportHistory.user points
    # at it) and after the previous "main" merge migration.
    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("main", "0044_merge_20250124_2333"),
    ]

    operations = [
        # Classification.algorithm becomes a nullable FK to ml.algorithm that is
        # set to NULL when the referenced algorithm is deleted.
        migrations.AlterField(
            model_name="classification",
            name="algorithm",
            field=models.ForeignKey(
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="classifications",
                to="ml.algorithm",
            ),
        ),
        # New ExportHistory table: one row per export task.
        migrations.CreateModel(
            name="ExportHistory",
            fields=[
                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                # Timestamps set automatically on insert / on every save.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                # Task identifier; the unique constraint allows one row per task.
                ("task_id", models.CharField(max_length=255, unique=True)),
                (
                    # Export state, restricted to pending / completed / failed.
                    "status",
                    models.CharField(
                        choices=[("pending", "Pending"), ("completed", "Completed"), ("failed", "Failed")],
                        default="pending",
                        max_length=10,
                    ),
                ),
                # Optional URL of the exported file; may be blank or NULL.
                ("file_url", models.URLField(blank=True, null=True)),
                (
                    # Owning user; rows are deleted together with the user.
                    # Reverse accessor on the user model is "exports".
                    "user",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="exports",
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
        ),
    ]
18 changes: 18 additions & 0 deletions ami/main/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3097,3 +3097,21 @@ def get_or_create_starred_collection(cls, project: Project) -> "SourceImageColle
name="Starred Images", # @TODO make this translatable
)
return collection


class ExportHistory(BaseModel):
    """A model to track Occurrence data exports.

    Each row ties a user to one export task (identified by ``task_id``) and
    stores that task's current status and, when available, the URL of the
    exported file.
    """

    # Allowed values for ``status`` as (stored value, display label) pairs.
    STATUS_CHOICES = [
        ("pending", "Pending"),
        ("completed", "Completed"),
        ("failed", "Failed"),
    ]

    # User who owns the export; deleting the user deletes their export rows.
    # Reverse accessor on the user is ``exports``.
    user = models.ForeignKey(User, on_delete=models.CASCADE, related_name="exports")
    # Identifier of the export task; unique, so there is at most one row per task.
    task_id = models.CharField(max_length=255, unique=True)
    # Current state of the export; new rows start as "pending".
    status = models.CharField(max_length=10, choices=STATUS_CHOICES, default="pending")
    # Location of the exported file; optional (may be blank or NULL).
    file_url = models.URLField(blank=True, null=True)

    def __str__(self):
        # Human-readable label combining the task id and its status.
        return f"Export {self.task_id} - {self.status}"
16 changes: 16 additions & 0 deletions ami/ml/migrations/0017_alter_algorithm_unique_together.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generated by Django 4.2.10 on 2025-02-17 03:18

from django.db import migrations


class Migration(migrations.Migration):
    # Schema migration for the "ml" app: changes the uniqueness rule on Algorithm.

    # Must run after the previous "ml" merge migration.
    dependencies = [
        ("ml", "0016_merge_20250117_2101"),
    ]

    operations = [
        # Set the Algorithm model's unique_together to the (name, version) pair.
        migrations.AlterUniqueTogether(
            name="algorithm",
            unique_together={("name", "version")},
        ),
    ]
Loading