From 73235322fc5c1f7e5812875c9539fc9e73f91bd6 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 16 Oct 2024 18:16:32 -0700 Subject: [PATCH 01/11] feat: enable viewing occurrences from a collection --- ami/main/api/views.py | 2 + ami/main/models.py | 10 +---- .../collection-details/capture-columns.tsx | 9 ++++- .../collections/collection-columns.tsx | 38 ++++++++++++++++++- ui/src/utils/useFilters.ts | 4 ++ 5 files changed, 53 insertions(+), 10 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index bc6ed96d1..75acc8a11 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -544,6 +544,8 @@ class SourceImageCollectionViewSet(DefaultViewSet): SourceImageCollection.objects.all() .with_source_images_count() # type: ignore .with_source_images_with_detections_count() + .with_occurrences_count() + .with_taxa_count() .prefetch_related("jobs") ) serializer_class = SourceImageCollectionSerializer diff --git a/ami/main/models.py b/ami/main/models.py index 84a7e3bf1..d694c6e43 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -2676,25 +2676,19 @@ def with_source_images_with_detections_count(self): ) ) - def with_occurrences_count(self, classification_threshold: float = 0): + def with_occurrences_count(self): return self.annotate( occurrences_count=models.Count( "images__detections__occurrence", - filter=models.Q( - images__detections__occurrence__determination_score__gte=classification_threshold, - ), distinct=True, ) ) - def with_taxa_count(self, classification_threshold: float = 0): + def with_taxa_count(self): return self.annotate( taxa_count=models.Count( "images__detections__occurrence__determination", distinct=True, - filter=models.Q( - images__detections__occurrence__determination_score__gte=classification_threshold, - ), ) ) diff --git a/ui/src/pages/collection-details/capture-columns.tsx b/ui/src/pages/collection-details/capture-columns.tsx index 8516f09cf..f6f72b40c 100644 --- a/ui/src/pages/collection-details/capture-columns.tsx +++ b/ui/src/pages/collection-details/capture-columns.tsx @@ -89,7 +89,14 @@ export const columns: (projectId: string) => TableColumn[] = ( textAlign: TextAlign.Right, }, renderCell: (item: Capture) => ( - + + + ), }, { diff --git a/ui/src/pages/overview/collections/collection-columns.tsx b/ui/src/pages/overview/collections/collection-columns.tsx index c3cd7ddab..e94bef907 100644 --- a/ui/src/pages/overview/collections/collection-columns.tsx +++ b/ui/src/pages/overview/collections/collection-columns.tsx @@ -50,7 +50,43 @@ export const columns: (projectId: string) => TableColumn[] = ( textAlign: TextAlign.Right, }, renderCell: (item: Collection) => ( - + + ), + }, + { + id: 'occurrences', + name: translate(STRING.FIELD_LABEL_OCCURRENCES), + sortField: 'occurrences_count', + styles: { + textAlign: TextAlign.Right, + }, + renderCell: (item: Collection) => ( + + + + ), + }, + { + id: 'taxa', + name: translate(STRING.FIELD_LABEL_SPECIES), + sortField: 'taxa_count', + styles: { + textAlign: TextAlign.Right, + }, + renderCell: (item: Collection) => ( + + + ), }, { diff --git a/ui/src/utils/useFilters.ts b/ui/src/utils/useFilters.ts index 7a8b5e4ea..03d2b24f4 100644 --- a/ui/src/utils/useFilters.ts +++ b/ui/src/utils/useFilters.ts @@ -65,6 +65,10 @@ export const AVAILABLE_FILTERS = [ label: 'Verification status', field: 'verified', }, + { + label: 'Capture collection', + field: 'collection', + }, ] export const useFilters = (defaultFilters?: { [field: string]: string }) => { From 88136e9c26ccb797b46781def5daa525f1f22445 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 18 Oct 2024 09:15:04 -0700 Subject: [PATCH 02/11] feat: test approaches for data export --- ami/exports/apps.py | 7 ++ ami/exports/base.py | 64 ++++++++++ ami/exports/by_capture.py | 114 ++++++++++++++++++ .../management/commands/export_by_capture.py | 34 ++++++ ami/exports/models.py | 0 .../management/commands/export_occurrences.py | 94 +++++++++++++++ config/settings/base.py | 5 +- 7 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 ami/exports/apps.py create mode 100644 ami/exports/base.py create mode 100644 ami/exports/by_capture.py create mode 100644 ami/exports/management/commands/export_by_capture.py create mode 100644 ami/exports/models.py create mode 100644 ami/main/management/commands/export_occurrences.py diff --git a/ami/exports/apps.py b/ami/exports/apps.py new file mode 100644 index 000000000..d3d96b498 --- /dev/null +++ b/ami/exports/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig +from django.utils.translation import gettext_lazy as _ + + +class MainConfig(AppConfig): + name = "ami.exports" + verbose_name = _("Data Exports & Reports") diff --git a/ami/exports/base.py b/ami/exports/base.py new file mode 100644 index 000000000..6aac9c93e --- /dev/null +++ b/ami/exports/base.py @@ -0,0 +1,64 @@ +import csv +import json +import logging + +from django.core.cache import cache +from django.core.files.storage import default_storage +from django.utils import timezone +from django.utils.text import slugify +from rest_framework import serializers +from rest_framework.views import APIView + +logger = logging.getLogger(__name__) + + +class BaseExportSerializer(serializers.Serializer): + """ + Base serializer for exporting data in various formats, from multiple models. + """ + + pass + + +class BaseExportView(APIView): + """ + Read-only API view for exporting data in various formats, from multiple models. + """ + + pass + + +def write_export(report_name, Serializer, get_data_batch_function, format="csv"): + timestamp = timezone.now().strftime("%Y%m%d-%H%M%S") + file_name = f"{slugify(report_name)}-{timestamp}.{format}" + file_path = f"exports/{file_name}" + + try: + with default_storage.open(file_path, "w") as file: + if format == "csv": + writer = csv.writer(file) + writer.writerow(Serializer().fields.keys()) # Write header + for batch in get_data_batch_function(): + serializer = Serializer(batch, many=True) + for row in serializer.data: + writer.writerow(row.values()) + else: # JSON + file.write("[") + first = True + for batch in get_data_batch_function(report_name): + serializer = Serializer(batch, many=True) + for item in serializer.data: + if not first: + file.write(",") + json.dump(item, file) + first = False + file.write("]") + + # Cache the file path + cache.set(f"export_{report_name}_{format}", file_path, 3600) # Cache for 1 hour + + logger.info(f"Export generated successfully: {file_path}") + return file_path + except Exception as e: + logger.error(f"Error generating export: {str(e)}") + raise diff --git a/ami/exports/by_capture.py b/ami/exports/by_capture.py new file mode 100644 index 000000000..5ed583b33 --- /dev/null +++ b/ami/exports/by_capture.py @@ -0,0 +1,114 @@ +# Views, serializers and queries for the by_capture export type + +""" +This export should contain the following fields: + +- Capture ID +- Date Observed +- Time Observed +- Latitude +- Longitude +- Taxon ID (include not-moth) +- Count (count of this taxon in one image) +- Taxon scientific name +- Taxon rank +- Taxon specific epithet +- Taxon genus +- Taxon family +- Softmax score +- Num detections (in same capture) +- Station Name +- Session ID +- Session Start Date +- Session duration +- Device ID +- Detection algorithm ID +- Moth/Not moth classifier algorithm ID +- Species Classification Algorithm ID +- Verification user IDs +- Verified +- Verified on +""" + +import logging + +from django.db import models +from rest_framework import serializers + +from ami.main.models import Detection + +logger = logging.getLogger(__name__) + + +class DetectionsByDeterminatinonAndCaptureSerializer(serializers.Serializer): + """ + Specify the field names, order of fields, and the format of each field value for the export. + """ + + capture_id = serializers.IntegerField(source="source_image_id") + # date_observed = serializers.DateField() + # time_observed = serializers.TimeField() + # latitude = serializers.FloatField() + # longitude = serializers.FloatField() + # taxon_id = serializers.IntegerField() + # taxon_scientific_name = serializers.CharField() + + +def get_queryset(): + return ( + Detection.objects.all() + .select_related( + "occurrence", + "source_image", + ) + .prefetch_related() + .values( + "occurrence_id", + "source_image_id", + "source_image__timestamp", + "source_image__deployment__latitude", + "source_image__deployment__longitude", + "occurrence__determination_id", + "occurrence__determination_score", + ) + .annotate( + taxon_scientific_name=models.F("occurrence__determination__display_name"), + taxon_rank=models.F("occurrence__determination__rank"), + # taxon_family=F("determination__family"), + # num_detections=Count("occurrence__detections"), + # verification_user_ids=F("occurrence__source_image__collection__session__device__verification_users"), + ) + ) + + +def get_data_in_batches(batch_size=1000): + QuerySet = get_queryset() + items = QuerySet.iterator(chunk_size=batch_size) + batch = [] + logger.info(f"QuerySet: {QuerySet}") + for i, item in enumerate(items): + # logger.info(f"Processing item {i}") + try: + # item_data = { + # "user_id": item.id, + # "username": item.username, + # "email": item.email, + # "total_orders": Order.objects.filter(user=item).count(), + # "total_spent": Order.objects.filter(user=item).aggregate(total=Sum("total_amount"))["total"] or 0, + # } + item_data = item + serializer = DetectionsByDeterminatinonAndCaptureSerializer(data=item_data) + if serializer.is_valid(): + batch.append(serializer.validated_data) + else: + logger.warning(f"Invalid data for item {i}: {serializer.errors}") + logger.info(item_data) + + if len(batch) >= batch_size: + logger.info(f"Yielding batch {i}") + yield batch + batch = [] + except Exception as e: + logger.warning(f"Error processing item {i}: {str(e)}") + if batch: + yield batch diff --git a/ami/exports/management/commands/export_by_capture.py b/ami/exports/management/commands/export_by_capture.py new file mode 100644 index 000000000..f0f169079 --- /dev/null +++ b/ami/exports/management/commands/export_by_capture.py @@ -0,0 +1,34 @@ +""" +Management command that runs the export_by_capture function in exports.py and reports the progress as it processes and +writes batches. +""" + +import logging + +from django.core.management.base import BaseCommand + +from ami.exports import by_capture +from ami.exports.base import write_export + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Export data by capture" + + def handle(self, *args, **options): + # for i, batch in enumerate(by_capture.get_data_in_batches()): + # # print(f"Processing batch {batch}") + # print(f"Processing batch {i}") + + fname = write_export( + "detections_by_determination_and_capture", + by_capture.DetectionsByDeterminatinonAndCaptureSerializer, + by_capture.get_data_in_batches, + format="csv", + ) + # get full path to the file + print(f"Exported to {fname}") + + logger.info("Export by capture completed") + self.stdout.write(self.style.SUCCESS("Export by capture completed")) diff --git a/ami/exports/models.py b/ami/exports/models.py new file mode 100644 index 000000000..e69de29bb diff --git a/ami/main/management/commands/export_occurrences.py b/ami/main/management/commands/export_occurrences.py new file mode 100644 index 000000000..ad3b2ea95 --- /dev/null +++ b/ami/main/management/commands/export_occurrences.py @@ -0,0 +1,94 @@ +import json +import logging +from typing import Any + +from django.core.management.base import BaseCommand, CommandError +from django.db.models import Q +from django.test.client import RequestFactory +from tqdm import tqdm + +from ami.main.api.serializers import OccurrenceSerializer +from ami.main.models import Deployment, Occurrence, Project + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class OccurrenceExportSerializer(OccurrenceSerializer): + class Meta(OccurrenceSerializer.Meta): + extra_kwargs = { + "url": {"view_name": "api:occurrence-detail"}, + } + + def to_representation(self, instance): + representation = super().to_representation(instance) + for field in self.fields: + try: + logger.debug(f"Processing field: {field}") + representation[field] = self.fields[field].to_representation(getattr(instance, field)) + except Exception as e: + logger.error(f"Error processing field {field}: {e}") + raise e + return representation + + +class Command(BaseCommand): + help = "Export Occurrence model instances to JSON" + + def add_arguments(self, parser): + parser.add_argument("--project", type=int, help="Filter by project ID") + parser.add_argument("--deployment", type=int, help="Filter by deployment ID") + parser.add_argument("--start-date", type=str, help="Filter by start date (YYYY-MM-DD)") + parser.add_argument("--end-date", type=str, help="Filter by end date (YYYY-MM-DD)") + parser.add_argument("--output", type=str, default="occurrences_export.json", help="Output file name") + parser.add_argument("--limit", type=int, default=10, help="Limit the number of occurrences to export") + parser.add_argument( + "--base-url", type=str, default="http://example.com", help="Base URL for hyperlinked fields" + ) + + def handle(self, *args: Any, **options: Any) -> None: + queryset = Occurrence.objects.all() + + if options["project"]: + try: + project = Project.objects.get(pk=options["project"]) + queryset = queryset.filter(project=project) + except Project.DoesNotExist: + raise CommandError(f"Project with ID {options['project']} does not exist") + + if options["deployment"]: + try: + deployment = Deployment.objects.get(pk=options["deployment"]) + queryset = queryset.filter(deployment=deployment) + except Deployment.DoesNotExist: + raise CommandError(f"Deployment with ID {options['deployment']} does not exist") + + date_filter = Q() + if options["start_date"]: + date_filter &= Q(event__start__gte=options["start_date"]) + if options["end_date"]: + date_filter &= Q(event__start__lte=options["end_date"]) + queryset = queryset.filter(date_filter) + + limit = options["limit"] + queryset = queryset[:limit] + total_occurrences = queryset.count() + self.stdout.write(f"Exporting up to {limit} occurrences...") + + serialized_data = [] + + # Create a fake request for the serializer context + factory = RequestFactory() + fake_request = factory.get("/") + fake_request.META["HTTP_HOST"] = options["base_url"] + + for occurrence in tqdm(queryset, total=total_occurrences, desc="Exporting occurrences"): + serializer = OccurrenceExportSerializer(occurrence, context={"request": fake_request}) + serialized_data.append(serializer.data) + + with open(options["output"], "w") as f: + json.dump(serialized_data, f, indent=2) + + self.stdout.write( + self.style.SUCCESS(f"Successfully exported {total_occurrences} occurrences to {options['output']}") + ) diff --git a/config/settings/base.py b/config/settings/base.py index ba061c10d..f77a950fc 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -100,6 +100,7 @@ "ami.jobs", "ami.ml", "ami.labelstudio", + "ami.exports", ] # https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS @@ -333,8 +334,8 @@ CSRF_TRUSTED_ORIGINS = env.list( "DJANGO_CSRF_TRUSTED_ORIGINS", default=[ - "https://api.dev.insectai.org", - "http://api.dev.insectai.org", + "http://localhost:4000/", + "http://localhost:8000/", EXTERNAL_BASE_URL, ], # type: ignore[no-untyped-call] ) From d2f69c8c5d86cbbfd59704495cb1e51a17a3acef Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 18 Oct 2024 17:54:00 -0700 Subject: [PATCH 03/11] feat: make export functions reusable --- ami/exports/base.py | 50 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/ami/exports/base.py b/ami/exports/base.py index 6aac9c93e..05d51edc5 100644 --- a/ami/exports/base.py +++ b/ami/exports/base.py @@ -1,9 +1,11 @@ import csv import json import logging +from typing import Type from django.core.cache import cache from django.core.files.storage import default_storage +from django.db import models from django.utils import timezone from django.utils.text import slugify from rest_framework import serializers @@ -28,26 +30,56 @@ class BaseExportView(APIView): pass -def write_export(report_name, Serializer, get_data_batch_function, format="csv"): +def get_data_in_batches(QuerySet: models.QuerySet, Serializer: Type[serializers.Serializer], batch_size=1000): + items = QuerySet.iterator(chunk_size=batch_size) + batch = [] + logger.info(f"QuerySet: {QuerySet}") + for i, item in enumerate(items): + # logger.info(f"Processing item {i}") + try: + # item_data = { + # "user_id": item.id, + # "username": item.username, + # "email": item.email, + # "total_orders": Order.objects.filter(user=item).count(), + # "total_spent": Order.objects.filter(user=item).aggregate(total=Sum("total_amount"))["total"] or 0, + # } + serializer = Serializer(item) + item_data = serializer.data + batch.append(item_data) + logger.info(item_data) + + if len(batch) >= batch_size: + logger.info(f"Yielding batch {i}") + yield batch + batch = [] + except Exception as e: + logger.warning(f"Error processing item {i}: {str(e)}") + raise + if batch: + yield batch + + +def write_export(report_name, Serializer: Type[serializers.Serializer], QuerySet: models.QuerySet, format="csv"): timestamp = timezone.now().strftime("%Y%m%d-%H%M%S") file_name = f"{slugify(report_name)}-{timestamp}.{format}" - file_path = f"exports/{file_name}" + # file_path = f"exports/{file_name}" + file_path = file_name try: with default_storage.open(file_path, "w") as file: if format == "csv": writer = csv.writer(file) writer.writerow(Serializer().fields.keys()) # Write header - for batch in get_data_batch_function(): - serializer = Serializer(batch, many=True) - for row in serializer.data: - writer.writerow(row.values()) + for batch in get_data_in_batches(Serializer=Serializer, QuerySet=QuerySet): + for item in batch: + print(item) + writer.writerow(item.values()) else: # JSON file.write("[") first = True - for batch in get_data_batch_function(report_name): - serializer = Serializer(batch, many=True) - for item in serializer.data: + for batch in get_data_in_batches(Serializer=Serializer, QuerySet=QuerySet): + for item in batch: if not first: file.write(",") json.dump(item, file) From b66a53c19a32282b95f2ed26cad35fc6423e7b62 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 18 Oct 2024 17:54:19 -0700 Subject: [PATCH 04/11] fix: add fields to Max's export --- ami/exports/by_capture.py | 120 +++++++++++------- .../management/commands/export_by_capture.py | 6 +- 2 files changed, 76 insertions(+), 50 deletions(-) diff --git a/ami/exports/by_capture.py b/ami/exports/by_capture.py index 5ed583b33..bf61a9a31 100644 --- a/ami/exports/by_capture.py +++ b/ami/exports/by_capture.py @@ -31,27 +31,60 @@ """ import logging +import typing from django.db import models from rest_framework import serializers -from ami.main.models import Detection +from ami.main.models import Detection, Taxon, TaxonRank logger = logging.getLogger(__name__) -class DetectionsByDeterminatinonAndCaptureSerializer(serializers.Serializer): +class DetectionsByDeterminationAndCaptureTabularSerializer(serializers.Serializer): """ Specify the field names, order of fields, and the format of each field value for the export. """ + detection_id = serializers.IntegerField(source="id") + occurrence_id = serializers.IntegerField() capture_id = serializers.IntegerField(source="source_image_id") + latitude = serializers.FloatField() + longitude = serializers.FloatField() + station_name = serializers.CharField() + station_id = serializers.IntegerField() + datetime_observed = serializers.DateTimeField() # date_observed = serializers.DateField() # time_observed = serializers.TimeField() - # latitude = serializers.FloatField() - # longitude = serializers.FloatField() - # taxon_id = serializers.IntegerField() + session_id = serializers.IntegerField() + session_start_datetime = serializers.DateTimeField() + session_end_datetime = serializers.DateTimeField() + session_duration = serializers.DurationField() + # date_observed = serializers.DateField(# Views, serializers and queries for the by_capture export type + determination_score = serializers.FloatField() # taxon_scientific_name = serializers.CharField() + # taxon_rank = serializers.CharField() + taxon_id = serializers.IntegerField() + taxon_name = serializers.CharField() + device_id = serializers.IntegerField() + device_name = serializers.CharField() + + def to_representation(self, instance: typing.Any) -> dict[str, typing.Any]: + data = super().to_representation(instance) + taxon: Taxon = instance.occurrence.determination + + for taxon_rank in taxon.parents_json: + field_name = f"taxon_{taxon_rank.rank.name.lower()}" + data[field_name] = taxon_rank.name + + return data + + def get_fields(self): + fields = super().get_fields() + for rank in TaxonRank: + field_name = f"taxon_{rank.name.lower()}" + fields[field_name] = serializers.CharField(required=False) + return fields def get_queryset(): @@ -59,56 +92,49 @@ def get_queryset(): Detection.objects.all() .select_related( "occurrence", + "occurrence__determination", "source_image", ) .prefetch_related() - .values( - "occurrence_id", - "source_image_id", - "source_image__timestamp", - "source_image__deployment__latitude", - "source_image__deployment__longitude", - "occurrence__determination_id", - "occurrence__determination_score", - ) + # .values( + # "id", + # "occurrence_id", + # "source_image_id", + # "source_image__timestamp", + # "source_image__deployment__latitude", + # "source_image__deployment__longitude", + # "occurrence__determination_id", + # "occurrence__determination_score", + # ) .annotate( + detection_id=models.F("id"), + # occurrence_id=models.F("occurrence_id"), + capture_id=models.F("source_image_id"), + datetime_observed=models.F("source_image__timestamp"), + # date_observed=models.F("source_image__timestamp__date"), + # time_observed=models.F("source_image__timestamp__time"), + latitude=models.F("source_image__deployment__latitude"), + longitude=models.F("source_image__deployment__longitude"), + session_id=models.F("source_image__event_id"), + session_start_datetime=models.F("source_image__event__start"), + session_end_datetime=models.F("source_image__event__end"), + # Calculate session duration + session_duration=models.F("source_image__event__end") - models.F("source_image__event__start"), taxon_scientific_name=models.F("occurrence__determination__display_name"), taxon_rank=models.F("occurrence__determination__rank"), + station_name=models.F("source_image__deployment__name"), + station_id=models.F("source_image__deployment_id"), + taxon_id=models.F("occurrence__determination_id"), + taxon_name=models.F("occurrence__determination__name"), + determination_score=models.F("occurrence__determination_score"), + device_id=models.F("source_image__deployment__device_id"), + device_name=models.F("source_image__deployment__device__name"), + # classification_algorithm_id=models.F("occurrence__determination__classification_algorithm_id"), + # taxon_specific_epithet=models.F("occurrence__determination__specific_epithet"), + # taxon_genus=models.F("occurrence__determination__genus"), # taxon_family=F("determination__family"), # num_detections=Count("occurrence__detections"), # verification_user_ids=F("occurrence__source_image__collection__session__device__verification_users"), ) + # Group the detections by capture and add a count of detections in each capture ) - - -def get_data_in_batches(batch_size=1000): - QuerySet = get_queryset() - items = QuerySet.iterator(chunk_size=batch_size) - batch = [] - logger.info(f"QuerySet: {QuerySet}") - for i, item in enumerate(items): - # logger.info(f"Processing item {i}") - try: - # item_data = { - # "user_id": item.id, - # "username": item.username, - # "email": item.email, - # "total_orders": Order.objects.filter(user=item).count(), - # "total_spent": Order.objects.filter(user=item).aggregate(total=Sum("total_amount"))["total"] or 0, - # } - item_data = item - serializer = DetectionsByDeterminatinonAndCaptureSerializer(data=item_data) - if serializer.is_valid(): - batch.append(serializer.validated_data) - else: - logger.warning(f"Invalid data for item {i}: {serializer.errors}") - logger.info(item_data) - - if len(batch) >= batch_size: - logger.info(f"Yielding batch {i}") - yield batch - batch = [] - except Exception as e: - logger.warning(f"Error processing item {i}: {str(e)}") - if batch: - yield batch diff --git a/ami/exports/management/commands/export_by_capture.py b/ami/exports/management/commands/export_by_capture.py index f0f169079..f29f092dd 100644 --- a/ami/exports/management/commands/export_by_capture.py +++ b/ami/exports/management/commands/export_by_capture.py @@ -17,14 +17,14 @@ class Command(BaseCommand): help = "Export data by capture" def handle(self, *args, **options): - # for i, batch in enumerate(by_capture.get_data_in_batches()): + # for i, batch in enumerate(by_capture.get_data_in_batches()) # # print(f"Processing batch {batch}") # print(f"Processing batch {i}") fname = write_export( "detections_by_determination_and_capture", - by_capture.DetectionsByDeterminatinonAndCaptureSerializer, - by_capture.get_data_in_batches, + Serializer=by_capture.DetectionsByDeterminationAndCaptureTabularSerializer, + QuerySet=by_capture.get_queryset().filter(occurrence__project=85).filter(source_image__collections=82), format="csv", ) # get full path to the file From 11fd4a5b6eddefa7120e20e01d28fa41a78f236f Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 18 Oct 2024 18:09:37 -0700 Subject: [PATCH 05/11] feat: group taxa by capture ID and add counts --- ami/exports/by_capture.py | 42 ++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/ami/exports/by_capture.py b/ami/exports/by_capture.py index bf61a9a31..ae3f681f9 100644 --- a/ami/exports/by_capture.py +++ b/ami/exports/by_capture.py @@ -33,6 +33,7 @@ import logging import typing +from django.contrib.postgres.aggregates import ArrayAgg from django.db import models from rest_framework import serializers @@ -46,13 +47,9 @@ class DetectionsByDeterminationAndCaptureTabularSerializer(serializers.Serialize Specify the field names, order of fields, and the format of each field value for the export. """ - detection_id = serializers.IntegerField(source="id") - occurrence_id = serializers.IntegerField() capture_id = serializers.IntegerField(source="source_image_id") latitude = serializers.FloatField() longitude = serializers.FloatField() - station_name = serializers.CharField() - station_id = serializers.IntegerField() datetime_observed = serializers.DateTimeField() # date_observed = serializers.DateField() # time_observed = serializers.TimeField() @@ -61,17 +58,21 @@ class DetectionsByDeterminationAndCaptureTabularSerializer(serializers.Serialize session_end_datetime = serializers.DateTimeField() session_duration = serializers.DurationField() # date_observed = serializers.DateField(# Views, serializers and queries for the by_capture export type - determination_score = serializers.FloatField() - # taxon_scientific_name = serializers.CharField() - # taxon_rank = serializers.CharField() taxon_id = serializers.IntegerField() taxon_name = serializers.CharField() + taxon_rank = serializers.CharField() + taxon_count = serializers.IntegerField() + determination_score_max = serializers.FloatField() + detection_ids = serializers.CharField() + occurrence_ids = serializers.CharField() + station_name = serializers.CharField() + station_id = serializers.IntegerField() device_id = serializers.IntegerField() device_name = serializers.CharField() def to_representation(self, instance: typing.Any) -> dict[str, typing.Any]: data = super().to_representation(instance) - taxon: Taxon = instance.occurrence.determination + taxon: Taxon = Taxon.objects.get(id=data["taxon_id"]) for taxon_rank in taxon.parents_json: field_name = f"taxon_{taxon_rank.rank.name.lower()}" @@ -95,19 +96,11 @@ def get_queryset(): "occurrence__determination", "source_image", ) - .prefetch_related() - # .values( - # "id", - # "occurrence_id", - # "source_image_id", - # "source_image__timestamp", - # "source_image__deployment__latitude", - # "source_image__deployment__longitude", - # "occurrence__determination_id", - # "occurrence__determination_score", - # ) + .values( + "source_image_id", + "occurrence__determination_id", + ) .annotate( - detection_id=models.F("id"), # occurrence_id=models.F("occurrence_id"), capture_id=models.F("source_image_id"), datetime_observed=models.F("source_image__timestamp"), @@ -120,13 +113,15 @@ def get_queryset(): session_end_datetime=models.F("source_image__event__end"), # Calculate session duration session_duration=models.F("source_image__event__end") - models.F("source_image__event__start"), - taxon_scientific_name=models.F("occurrence__determination__display_name"), - taxon_rank=models.F("occurrence__determination__rank"), station_name=models.F("source_image__deployment__name"), station_id=models.F("source_image__deployment_id"), taxon_id=models.F("occurrence__determination_id"), taxon_name=models.F("occurrence__determination__name"), - determination_score=models.F("occurrence__determination_score"), + taxon_rank=models.F("occurrence__determination__rank"), + determination_score_max=models.Max("occurrence__determination_score"), + taxon_count=models.Count("id"), + detection_ids=ArrayAgg("id"), + occurrence_ids=ArrayAgg("occurrence_id"), device_id=models.F("source_image__deployment__device_id"), device_name=models.F("source_image__deployment__device__name"), # classification_algorithm_id=models.F("occurrence__determination__classification_algorithm_id"), @@ -137,4 +132,5 @@ def get_queryset(): # verification_user_ids=F("occurrence__source_image__collection__session__device__verification_users"), ) # Group the detections by capture and add a count of detections in each capture + .order_by("source_image_id", "-taxon_count", "-determination_score_max") ) From f6dadeaaf457251ef0b72835e694bceda50dd5bf Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 18 Oct 2024 18:35:43 -0700 Subject: [PATCH 06/11] fix: update logging --- ami/exports/base.py | 38 +++++++------------ .../management/commands/export_by_capture.py | 5 ++- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/ami/exports/base.py b/ami/exports/base.py index 05d51edc5..1bfa3add8 100644 --- a/ami/exports/base.py +++ b/ami/exports/base.py @@ -10,6 +10,7 @@ from django.utils.text import slugify from rest_framework import serializers from rest_framework.views import APIView +from tqdm import tqdm logger = logging.getLogger(__name__) @@ -33,7 +34,6 @@ class BaseExportView(APIView): def get_data_in_batches(QuerySet: models.QuerySet, Serializer: Type[serializers.Serializer], batch_size=1000): items = QuerySet.iterator(chunk_size=batch_size) batch = [] - logger.info(f"QuerySet: {QuerySet}") for i, item in enumerate(items): # logger.info(f"Processing item {i}") try: @@ -47,10 +47,8 @@ def get_data_in_batches(QuerySet: models.QuerySet, Serializer: Type[serializers. serializer = Serializer(item) item_data = serializer.data batch.append(item_data) - logger.info(item_data) if len(batch) >= batch_size: - logger.info(f"Yielding batch {i}") yield batch batch = [] except Exception as e: @@ -60,37 +58,27 @@ def get_data_in_batches(QuerySet: models.QuerySet, Serializer: Type[serializers. yield batch -def write_export(report_name, Serializer: Type[serializers.Serializer], QuerySet: models.QuerySet, format="csv"): +def write_export(report_name, Serializer: Type[serializers.Serializer], QuerySet: models.QuerySet): timestamp = timezone.now().strftime("%Y%m%d-%H%M%S") - file_name = f"{slugify(report_name)}-{timestamp}.{format}" - # file_path = f"exports/{file_name}" + file_name = f"{slugify(report_name)}-{timestamp}.csv" file_path = file_name try: with default_storage.open(file_path, "w") as file: - if format == "csv": - writer = csv.writer(file) - writer.writerow(Serializer().fields.keys()) # Write header + writer = csv.writer(file) + writer.writerow(Serializer().fields.keys()) # Write header + + # Calculate total items for progress bar + total_items = QuerySet.count() + + with tqdm(total=total_items, desc="Exporting data", unit="items") as pbar: for batch in get_data_in_batches(Serializer=Serializer, QuerySet=QuerySet): for item in batch: - print(item) writer.writerow(item.values()) - else: # JSON - file.write("[") - first = True - for batch in get_data_in_batches(Serializer=Serializer, QuerySet=QuerySet): - for item in batch: - if not first: - file.write(",") - json.dump(item, file) - first = False - file.write("]") - - # Cache the file path - cache.set(f"export_{report_name}_{format}", file_path, 3600) # Cache for 1 hour + pbar.update(1) - logger.info(f"Export generated successfully: {file_path}") + logger.info(f"CSV export generated successfully: {file_path}") return file_path except Exception as e: - logger.error(f"Error generating export: {str(e)}") + logger.error(f"Error generating CSV export: {str(e)}") raise diff --git a/ami/exports/management/commands/export_by_capture.py b/ami/exports/management/commands/export_by_capture.py index f29f092dd..dff62b658 100644 --- a/ami/exports/management/commands/export_by_capture.py +++ b/ami/exports/management/commands/export_by_capture.py @@ -24,8 +24,9 @@ def handle(self, *args, **options): fname = write_export( "detections_by_determination_and_capture", Serializer=by_capture.DetectionsByDeterminationAndCaptureTabularSerializer, - QuerySet=by_capture.get_queryset().filter(occurrence__project=85).filter(source_image__collections=82), - format="csv", + QuerySet=by_capture.get_queryset() + .filter(occurrence__project=85) + .filter(source_image__collections__in=[82]), ) # get full path to the file print(f"Exported to {fname}") From fd383304b65a185d4052ba36eed0c6a662edd824 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 18 Oct 2024 18:51:06 -0700 Subject: [PATCH 07/11] feat: add date/time columns and by detections format --- ami/exports/by_capture.py | 63 +++---------- ami/exports/by_detection.py | 91 +++++++++++++++++++ .../management/commands/export_by_capture.py | 3 +- .../commands/export_by_detection.py | 36 ++++++++ 4 files changed, 142 insertions(+), 51 deletions(-) create mode 100644 ami/exports/by_detection.py create mode 100644 ami/exports/management/commands/export_by_detection.py diff --git a/ami/exports/by_capture.py b/ami/exports/by_capture.py index ae3f681f9..04ae3a825 100644 --- a/ami/exports/by_capture.py +++ b/ami/exports/by_capture.py @@ -1,40 +1,9 @@ -# Views, serializers and queries for the by_capture export type - -""" -This export should contain the following fields: - -- Capture ID -- Date Observed -- Time Observed -- Latitude -- Longitude -- Taxon ID (include not-moth) -- Count (count of this taxon in one image) -- Taxon scientific name -- Taxon rank -- Taxon specific epithet -- Taxon genus -- Taxon family -- Softmax score -- Num detections (in same capture) -- Station Name -- Session ID -- Session Start Date -- Session duration -- Device ID -- Detection algorithm ID -- Moth/Not moth classifier algorithm ID -- Species Classification Algorithm ID -- Verification user IDs -- Verified -- Verified on -""" - import logging import typing from django.contrib.postgres.aggregates import ArrayAgg from django.db import models +from django.db.models.functions import TruncDate, TruncTime from rest_framework import serializers from ami.main.models import Detection, Taxon, TaxonRank @@ -43,21 +12,20 @@ class DetectionsByDeterminationAndCaptureTabularSerializer(serializers.Serializer): - """ - Specify the field names, order of fields, and the format of each field value for the export. - """ - capture_id = serializers.IntegerField(source="source_image_id") latitude = serializers.FloatField() longitude = serializers.FloatField() datetime_observed = serializers.DateTimeField() - # date_observed = serializers.DateField() - # time_observed = serializers.TimeField() + date_observed = serializers.DateField() + time_observed = serializers.TimeField() session_id = serializers.IntegerField() session_start_datetime = serializers.DateTimeField() + session_start_date = serializers.DateField() + session_start_time = serializers.TimeField() session_end_datetime = serializers.DateTimeField() + session_end_date = serializers.DateField() + session_end_time = serializers.TimeField() session_duration = serializers.DurationField() - # date_observed = serializers.DateField(# Views, serializers and queries for the by_capture export type taxon_id = serializers.IntegerField() taxon_name = serializers.CharField() taxon_rank = serializers.CharField() @@ -101,17 +69,19 @@ def get_queryset(): "occurrence__determination_id", ) .annotate( - # occurrence_id=models.F("occurrence_id"), capture_id=models.F("source_image_id"), datetime_observed=models.F("source_image__timestamp"), - # date_observed=models.F("source_image__timestamp__date"), - # time_observed=models.F("source_image__timestamp__time"), + date_observed=TruncDate("source_image__timestamp"), + time_observed=TruncTime("source_image__timestamp"), latitude=models.F("source_image__deployment__latitude"), longitude=models.F("source_image__deployment__longitude"), session_id=models.F("source_image__event_id"), session_start_datetime=models.F("source_image__event__start"), + session_start_date=TruncDate("source_image__event__start"), + session_start_time=TruncTime("source_image__event__start"), session_end_datetime=models.F("source_image__event__end"), - # Calculate session duration + session_end_date=TruncDate("source_image__event__end"), + session_end_time=TruncTime("source_image__event__end"), session_duration=models.F("source_image__event__end") - models.F("source_image__event__start"), station_name=models.F("source_image__deployment__name"), station_id=models.F("source_image__deployment_id"), @@ -124,13 +94,6 @@ def get_queryset(): occurrence_ids=ArrayAgg("occurrence_id"), device_id=models.F("source_image__deployment__device_id"), device_name=models.F("source_image__deployment__device__name"), - # classification_algorithm_id=models.F("occurrence__determination__classification_algorithm_id"), - # taxon_specific_epithet=models.F("occurrence__determination__specific_epithet"), - # taxon_genus=models.F("occurrence__determination__genus"), - # taxon_family=F("determination__family"), - # num_detections=Count("occurrence__detections"), - # verification_user_ids=F("occurrence__source_image__collection__session__device__verification_users"), ) - # Group the detections by capture and add a count of detections in each capture .order_by("source_image_id", "-taxon_count", "-determination_score_max") ) diff --git a/ami/exports/by_detection.py b/ami/exports/by_detection.py new file mode 100644 index 000000000..4763f7382 --- /dev/null +++ b/ami/exports/by_detection.py @@ -0,0 +1,91 @@ +import logging +import typing + +from django.db import models +from django.db.models.functions import TruncDate, TruncTime +from rest_framework import serializers + +from ami.main.models import Detection, Taxon, TaxonRank + +logger = logging.getLogger(__name__) + + +class DetectionsTabularSerializer(serializers.Serializer): + detection_id = serializers.IntegerField(source="id") + occurrence_id = serializers.IntegerField() + capture_id = serializers.IntegerField(source="source_image_id") + latitude = serializers.FloatField() + longitude = serializers.FloatField() + datetime_observed = serializers.DateTimeField() + date_observed = serializers.DateField() + time_observed = serializers.TimeField() + session_id = serializers.IntegerField() + session_start_datetime = serializers.DateTimeField() + session_start_date = serializers.DateField() + session_start_time = serializers.TimeField() + session_end_datetime = serializers.DateTimeField() + session_end_date = serializers.DateField() + session_end_time = serializers.TimeField() + session_duration = serializers.DurationField() + taxon_id = serializers.IntegerField() + taxon_name = serializers.CharField() + taxon_rank = serializers.CharField() + determination_score = serializers.FloatField() + station_name = serializers.CharField() + station_id = serializers.IntegerField() + device_id = serializers.IntegerField() + device_name = serializers.CharField() + + def to_representation(self, instance: typing.Any) -> dict[str, typing.Any]: + data = super().to_representation(instance) + taxon: Taxon = Taxon.objects.get(id=data["taxon_id"]) + + for taxon_rank in taxon.parents_json: + field_name = f"taxon_{taxon_rank.rank.name.lower()}" + data[field_name] = taxon_rank.name + + return data + + def get_fields(self): + fields = super().get_fields() + for rank in TaxonRank: + field_name = f"taxon_{rank.name.lower()}" + fields[field_name] = serializers.CharField(required=False) + return fields + + +def get_queryset(): + return ( + Detection.objects.all() + .select_related( + "occurrence", + "occurrence__determination", + "source_image", + ) + .annotate( + capture_id=models.F("source_image_id"), + datetime_observed=models.F("source_image__timestamp"), + date_observed=TruncDate("source_image__timestamp"), + time_observed=TruncTime("source_image__timestamp"), + latitude=models.F("source_image__deployment__latitude"), + longitude=models.F("source_image__deployment__longitude"), + session_id=models.F("source_image__event_id"), + session_start_datetime=models.F("source_image__event__start"), + session_start_date=TruncDate("source_image__event__start"), + session_start_time=TruncTime("source_image__event__start"), + session_end_datetime=models.F("source_image__event__end"), + session_end_date=TruncDate("source_image__event__end"), + session_end_time=TruncTime("source_image__event__end"), + session_duration=models.F("source_image__event__end") - models.F("source_image__event__start"), + station_name=models.F("source_image__deployment__name"), + station_id=models.F("source_image__deployment_id"), + taxon_id=models.F("occurrence__determination_id"), + taxon_name=models.F("occurrence__determination__name"), + taxon_rank=models.F("occurrence__determination__rank"), + determination_score=models.F("occurrence__determination_score"), + taxon_count=models.Count("id"), + device_id=models.F("source_image__deployment__device_id"), + device_name=models.F("source_image__deployment__device__name"), + ) + .order_by("source_image_id", "-determination_score") + ) diff --git a/ami/exports/management/commands/export_by_capture.py b/ami/exports/management/commands/export_by_capture.py index dff62b658..2e0939880 100644 --- a/ami/exports/management/commands/export_by_capture.py +++ b/ami/exports/management/commands/export_by_capture.py @@ -26,7 +26,8 @@ def handle(self, *args, **options): Serializer=by_capture.DetectionsByDeterminationAndCaptureTabularSerializer, QuerySet=by_capture.get_queryset() .filter(occurrence__project=85) - .filter(source_image__collections__in=[82]), + .filter(source_image__collections__in=[82, 79]), + # .filter(source_image__collections__in=[82]), ) # get full path to the file print(f"Exported to {fname}") diff --git a/ami/exports/management/commands/export_by_detection.py b/ami/exports/management/commands/export_by_detection.py new file mode 100644 index 000000000..e8834d0cf --- /dev/null +++ b/ami/exports/management/commands/export_by_detection.py @@ -0,0 +1,36 @@ +""" +Management command that runs the export_by_capture function in exports.py and reports the progress as it processes and +writes batches. +""" + +import logging + +from django.core.management.base import BaseCommand + +from ami.exports import by_detection +from ami.exports.base import write_export + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Export data by capture" + + def handle(self, *args, **options): + # for i, batch in enumerate(by_capture.get_data_in_batches()) + # # print(f"Processing batch {batch}") + # print(f"Processing batch {i}") + + fname = write_export( + "detections", + Serializer=by_detection.DetectionsTabularSerializer, + QuerySet=by_detection.get_queryset() + .filter(occurrence__project=85) + .filter(source_image__collections__in=[82, 79]), + # .filter(source_image__collections__in=[82]), + ) + # get full path to the file + print(f"Exported to {fname}") + + logger.info("Export by capture completed") + self.stdout.write(self.style.SUCCESS("Export by capture completed")) From 2306766eca635fe561c5b36d3b7de1378c0bc263 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Sat, 19 Oct 2024 00:07:44 -0700 Subject: [PATCH 08/11] fix: get taxon differently --- ami/exports/by_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ami/exports/by_detection.py b/ami/exports/by_detection.py index 4763f7382..c11796641 100644 --- a/ami/exports/by_detection.py +++ b/ami/exports/by_detection.py @@ -38,7 +38,7 @@ class DetectionsTabularSerializer(serializers.Serializer): def to_representation(self, instance: typing.Any) -> dict[str, typing.Any]: data = super().to_representation(instance) - taxon: Taxon = Taxon.objects.get(id=data["taxon_id"]) + taxon: Taxon = instance.occurrence.determination for taxon_rank in taxon.parents_json: field_name = f"taxon_{taxon_rank.rank.name.lower()}" From 4399e26324093cdc79c5ec45f786b7d9e9f8684d Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 21 Oct 2024 22:25:46 -0700 Subject: [PATCH 09/11] feat: exports for sessions & capture summaries --- ami/exports/all_captures.py | 63 +++++++++++++++++++ ami/exports/all_sessions.py | 58 +++++++++++++++++ .../management/commands/export_captures.py | 34 ++++++++++ .../management/commands/export_sessions.py | 33 ++++++++++ 4 files changed, 188 insertions(+) create mode 100644 ami/exports/all_captures.py create mode 100644 ami/exports/all_sessions.py create mode 100644 ami/exports/management/commands/export_captures.py create mode 100644 ami/exports/management/commands/export_sessions.py diff --git a/ami/exports/all_captures.py b/ami/exports/all_captures.py new file mode 100644 index 000000000..43bd895ac --- /dev/null +++ b/ami/exports/all_captures.py @@ -0,0 +1,63 @@ +import logging +import typing + +from django.db import models +from django.db.models.functions import TruncDate, TruncTime +from rest_framework import serializers + +from ami.main.models import Detection, SourceImage, Taxon, TaxonRank + +logger = logging.getLogger(__name__) + + +class CapturesTabularSerializer(serializers.Serializer): + capture_id = serializers.IntegerField(source="id") + latitude = serializers.FloatField() + longitude = serializers.FloatField() + datetime_observed = serializers.DateTimeField() + date_observed = serializers.DateField() + time_observed = serializers.TimeField() + session_id = serializers.IntegerField() + session_start_datetime = serializers.DateTimeField() + session_start_date = serializers.DateField() + session_start_time = serializers.TimeField() + session_end_datetime = serializers.DateTimeField() + session_end_date = serializers.DateField() + session_end_time = serializers.TimeField() + session_duration = serializers.DurationField() + station_name = serializers.CharField() + station_id = serializers.IntegerField() + device_id = serializers.IntegerField() + device_name = serializers.CharField() + detections_count = serializers.IntegerField(source="detections_count_fresh") + occurrences_count = serializers.IntegerField() + taxa_count = serializers.IntegerField() + + +def get_queryset(): + return ( + SourceImage.objects.all() + .annotate( + datetime_observed=models.F("timestamp"), + date_observed=TruncDate("timestamp"), + time_observed=TruncTime("timestamp"), + latitude=models.F("deployment__latitude"), + longitude=models.F("deployment__longitude"), + session_id=models.F("event_id"), + session_start_datetime=models.F("event__start"), + session_start_date=TruncDate("event__start"), + session_start_time=TruncTime("event__start"), + session_end_datetime=models.F("event__end"), + session_end_date=TruncDate("event__end"), + session_end_time=TruncTime("event__end"), + session_duration=models.F("event__end") - models.F("event__start"), + station_name=models.F("deployment__name"), + station_id=models.F("deployment_id"), + device_id=models.F("deployment__device_id"), + device_name=models.F("deployment__device__name"), + detections_count_fresh=models.Count("detections", distinct=True), + occurrences_count=models.Count("detections__occurrence", distinct=True), + taxa_count=models.Count("detections__occurrence__determination", distinct=True), + ) + .order_by("datetime_observed") + ) diff --git a/ami/exports/all_sessions.py b/ami/exports/all_sessions.py new file mode 100644 index 000000000..464651223 --- /dev/null +++ b/ami/exports/all_sessions.py @@ -0,0 +1,58 @@ +import logging +import typing + +from django.db import models +from django.db.models.functions import TruncDate, TruncTime +from rest_framework import serializers + +from ami.main.models import Event + +logger = logging.getLogger(__name__) + + +class SessionsTabularSerializer(serializers.Serializer): + session_id = serializers.IntegerField(source="id") + session_start_datetime = serializers.DateTimeField() + session_start_date = serializers.DateField() + session_start_time = serializers.TimeField() + session_end_datetime = serializers.DateTimeField() + session_end_date = serializers.DateField() + session_end_time = serializers.TimeField() + session_duration = serializers.DurationField() + latitude = serializers.FloatField() + longitude = serializers.FloatField() + station_name = serializers.CharField() + station_id = serializers.IntegerField() + device_id = serializers.IntegerField() + device_name = serializers.CharField() + captures_count = serializers.IntegerField(source="captures_count_fresh") + detections_count = serializers.IntegerField(source="detections_count_fresh") + occurrences_count = serializers.IntegerField() + taxa_count = serializers.IntegerField() + + +def get_queryset(): + return ( + Event.objects.all() + .annotate( + session_id=models.F("id"), + session_start_datetime=models.F("start"), + session_start_date=TruncDate("start"), + session_start_time=TruncTime("start"), + session_end_datetime=models.F("end"), + session_end_date=TruncDate("end"), + session_end_time=TruncTime("end"), + session_duration=models.F("end") - models.F("start"), + latitude=models.F("deployment__latitude"), + longitude=models.F("deployment__longitude"), + station_name=models.F("deployment__name"), + station_id=models.F("deployment_id"), + device_id=models.F("deployment__device_id"), + device_name=models.F("deployment__device__name"), + captures_count_fresh=models.Count("captures", distinct=True), + detections_count_fresh=models.Count("captures__detections", distinct=True), + occurrences_count_fresh=models.Count("captures__detections__occurrence", distinct=True), + taxa_count=models.Count("captures__detections__occurrence__determination", distinct=True), + ) + .order_by("session_start_datetime", "station_id") + ) diff --git a/ami/exports/management/commands/export_captures.py b/ami/exports/management/commands/export_captures.py new file mode 100644 index 000000000..18f91e503 --- /dev/null +++ b/ami/exports/management/commands/export_captures.py @@ -0,0 +1,34 @@ +""" +Management command that runs the export_by_capture function in exports.py and reports the progress as it processes and +writes batches. +""" + +import logging + +from django.core.management.base import BaseCommand + +from ami.exports import all_captures +from ami.exports.base import write_export + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Export data by capture" + + def handle(self, *args, **options): + # for i, batch in enumerate(by_capture.get_data_in_batches()) + # # print(f"Processing batch {batch}") + # print(f"Processing batch {i}") + + fname = write_export( + "captures", + Serializer=all_captures.CapturesTabularSerializer, + QuerySet=all_captures.get_queryset().filter(project=85).filter(collections__in=[82, 79]), + # .filter(collections__in=[82]), + ) + # get full path to the file + print(f"Exported to {fname}") + + logger.info("Export by capture completed") + self.stdout.write(self.style.SUCCESS("Export by capture completed")) diff --git a/ami/exports/management/commands/export_sessions.py b/ami/exports/management/commands/export_sessions.py new file mode 100644 index 000000000..658180387 --- /dev/null +++ b/ami/exports/management/commands/export_sessions.py @@ -0,0 +1,33 @@ +""" +Management command that runs the export_by_capture function in exports.py and reports the progress as it processes and +writes batches. +""" + +import logging + +from django.core.management.base import BaseCommand + +from ami.exports import all_sessions +from ami.exports.base import write_export + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Export data by capture" + + def handle(self, *args, **options): + # for i, batch in enumerate(by_capture.get_data_in_batches()) + # # print(f"Processing batch {batch}") + # print(f"Processing batch {i}") + + fname = write_export( + "sessions", + Serializer=all_sessions.SessionsTabularSerializer, + QuerySet=all_sessions.get_queryset().filter(project=85), + ) + # get full path to the file + print(f"Exported to {fname}") + + logger.info("Export by capture completed") + self.stdout.write(self.style.SUCCESS("Export by capture completed")) From 272036834c58835d64932afee7bf62210598a3ff Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 27 Nov 2024 18:24:48 -0800 Subject: [PATCH 10/11] fix: unused imports --- ami/exports/all_captures.py | 3 +-- ami/exports/all_sessions.py | 1 - ami/exports/base.py | 7 ++----- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ami/exports/all_captures.py b/ami/exports/all_captures.py index 43bd895ac..7531c994d 100644 --- a/ami/exports/all_captures.py +++ b/ami/exports/all_captures.py @@ -1,11 +1,10 @@ import logging -import typing from django.db import models from django.db.models.functions import TruncDate, TruncTime from rest_framework import serializers -from ami.main.models import Detection, SourceImage, Taxon, TaxonRank +from ami.main.models import SourceImage logger = logging.getLogger(__name__) diff --git a/ami/exports/all_sessions.py b/ami/exports/all_sessions.py index 464651223..cc58c5c46 100644 --- a/ami/exports/all_sessions.py +++ b/ami/exports/all_sessions.py @@ -1,5 +1,4 @@ import logging -import typing from django.db import models from django.db.models.functions import TruncDate, TruncTime diff --git a/ami/exports/base.py b/ami/exports/base.py index 1bfa3add8..76c47f699 100644 --- a/ami/exports/base.py +++ b/ami/exports/base.py @@ -1,9 +1,6 @@ import csv -import json import logging -from typing import Type -from django.core.cache import cache from django.core.files.storage import default_storage from django.db import models from django.utils import timezone @@ -31,7 +28,7 @@ class BaseExportView(APIView): pass -def get_data_in_batches(QuerySet: models.QuerySet, Serializer: Type[serializers.Serializer], batch_size=1000): +def get_data_in_batches(QuerySet: models.QuerySet, Serializer: type[serializers.Serializer], batch_size=1000): items = QuerySet.iterator(chunk_size=batch_size) batch = [] for i, item in enumerate(items): @@ -58,7 +55,7 @@ def get_data_in_batches(QuerySet: models.QuerySet, Serializer: Type[serializers. yield batch -def write_export(report_name, Serializer: Type[serializers.Serializer], QuerySet: models.QuerySet): +def write_export(report_name, Serializer: type[serializers.Serializer], QuerySet: models.QuerySet): timestamp = timezone.now().strftime("%Y%m%d-%H%M%S") file_name = f"{slugify(report_name)}-{timestamp}.csv" file_path = file_name From a362fe47e5534a7555a63aa82aa32b314b598b95 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 10 Mar 2025 16:37:01 -0700 Subject: [PATCH 11/11] fix: add arguments and fix export commands made for SEC-SEQ Symposium. --- ami/exports/base.py | 13 +++++-- ami/exports/by_capture.py | 14 +++++--- .../management/commands/export_by_capture.py | 34 ++++++++++++++----- .../commands/export_by_detection.py | 33 ++++++++++++++---- .../management/commands/export_captures.py | 25 ++++++++++++-- .../management/commands/export_sessions.py | 13 +++++-- 6 files changed, 106 insertions(+), 26 deletions(-) diff --git a/ami/exports/base.py b/ami/exports/base.py index 76c47f699..df3690131 100644 --- a/ami/exports/base.py +++ b/ami/exports/base.py @@ -1,5 +1,6 @@ import csv import logging +import typing from django.core.files.storage import default_storage from django.db import models @@ -28,7 +29,11 @@ class BaseExportView(APIView): pass -def get_data_in_batches(QuerySet: models.QuerySet, Serializer: type[serializers.Serializer], batch_size=1000): +def get_data_in_batches( + QuerySet: models.QuerySet, + Serializer: type[serializers.Serializer], + batch_size: int = 1000, +) -> typing.Iterator[list[dict]]: items = QuerySet.iterator(chunk_size=batch_size) batch = [] for i, item in enumerate(items): @@ -55,7 +60,11 @@ def get_data_in_batches(QuerySet: models.QuerySet, Serializer: type[serializers. yield batch -def write_export(report_name, Serializer: type[serializers.Serializer], QuerySet: models.QuerySet): +def write_export( + report_name: str, + Serializer: type[serializers.Serializer], + QuerySet: models.QuerySet, +) -> str: timestamp = timezone.now().strftime("%Y%m%d-%H%M%S") file_name = f"{slugify(report_name)}-{timestamp}.csv" file_path = file_name diff --git a/ami/exports/by_capture.py b/ami/exports/by_capture.py index 04ae3a825..534ea3223 100644 --- a/ami/exports/by_capture.py +++ b/ami/exports/by_capture.py @@ -40,11 +40,15 @@ class DetectionsByDeterminationAndCaptureTabularSerializer(serializers.Serialize def to_representation(self, instance: typing.Any) -> dict[str, typing.Any]: data = super().to_representation(instance) - taxon: Taxon = Taxon.objects.get(id=data["taxon_id"]) - - for taxon_rank in taxon.parents_json: - field_name = f"taxon_{taxon_rank.rank.name.lower()}" - data[field_name] = taxon_rank.name + try: + taxon: Taxon = Taxon.objects.get(id=data["taxon_id"]) + except Taxon.DoesNotExist: + logger.warning(f"Taxon with ID '{data['taxon_id']}' not found") + pass + else: + for taxon_rank in taxon.parents_json: + field_name = f"taxon_{taxon_rank.rank.name.lower()}" + data[field_name] = taxon_rank.name return data diff --git a/ami/exports/management/commands/export_by_capture.py b/ami/exports/management/commands/export_by_capture.py index 2e0939880..cf5b1c757 100644 --- a/ami/exports/management/commands/export_by_capture.py +++ b/ami/exports/management/commands/export_by_capture.py @@ -4,8 +4,10 @@ """ import logging +import typing from django.core.management.base import BaseCommand +from django.db import models from ami.exports import by_capture from ami.exports.base import write_export @@ -16,18 +18,34 @@ class Command(BaseCommand): help = "Export data by capture" - def handle(self, *args, **options): - # for i, batch in enumerate(by_capture.get_data_in_batches()) - # # print(f"Processing batch {batch}") - # print(f"Processing batch {i}") + def add_arguments(self, parser) -> None: + parser.add_argument( + "--project-id", + type=int, + required=True, + help="Project ID to export data from", + ) + parser.add_argument( + "--collection-ids", + type=int, + nargs="+", + required=False, + default=[], + help="Collection IDs to export data from (space-separated list)", + ) + + def handle(self, *args, **options) -> None: + project_id: int = options["project_id"] + collection_ids: list[int] = options["collection_ids"] + + qs = by_capture.get_queryset().filter(occurrence__project=project_id) + if collection_ids: + qs = qs.filter(source_image__collections__in=collection_ids) fname = write_export( "detections_by_determination_and_capture", Serializer=by_capture.DetectionsByDeterminationAndCaptureTabularSerializer, - QuerySet=by_capture.get_queryset() - .filter(occurrence__project=85) - .filter(source_image__collections__in=[82, 79]), - # .filter(source_image__collections__in=[82]), + QuerySet=typing.cast(models.QuerySet, qs), ) # get full path to the file print(f"Exported to {fname}") diff --git a/ami/exports/management/commands/export_by_detection.py b/ami/exports/management/commands/export_by_detection.py index e8834d0cf..7fb36a46a 100644 --- a/ami/exports/management/commands/export_by_detection.py +++ b/ami/exports/management/commands/export_by_detection.py @@ -14,23 +14,42 @@ class Command(BaseCommand): - help = "Export data by capture" + help = "Export data by detection and determination" + + def add_arguments(self, parser) -> None: + parser.add_argument( + "--project-id", + type=int, + required=True, + help="Project ID to export data from", + ) + parser.add_argument( + "--collection-ids", + type=int, + nargs="+", + required=False, + default=[], + help="Collection IDs to export data from (space-separated list)", + ) def handle(self, *args, **options): # for i, batch in enumerate(by_capture.get_data_in_batches()) # # print(f"Processing batch {batch}") # print(f"Processing batch {i}") + project_id: int = options["project_id"] + collection_ids: list[int] = options["collection_ids"] + + qs = by_detection.get_queryset().filter(occurrence__project=project_id) + if collection_ids: + qs = qs.filter(source_image__collections__in=collection_ids) fname = write_export( "detections", Serializer=by_detection.DetectionsTabularSerializer, - QuerySet=by_detection.get_queryset() - .filter(occurrence__project=85) - .filter(source_image__collections__in=[82, 79]), - # .filter(source_image__collections__in=[82]), + QuerySet=qs, ) # get full path to the file print(f"Exported to {fname}") - logger.info("Export by capture completed") - self.stdout.write(self.style.SUCCESS("Export by capture completed")) + logger.info("Export by detection completed") + self.stdout.write(self.style.SUCCESS("Export by detection completed")) diff --git a/ami/exports/management/commands/export_captures.py b/ami/exports/management/commands/export_captures.py index 18f91e503..6d2391ad0 100644 --- a/ami/exports/management/commands/export_captures.py +++ b/ami/exports/management/commands/export_captures.py @@ -16,16 +16,37 @@ class Command(BaseCommand): help = "Export data by capture" + def add_arguments(self, parser) -> None: + parser.add_argument( + "--project-id", + type=int, + required=True, + help="Project ID to export data from", + ) + parser.add_argument( + "--collection-ids", + type=int, + nargs="+", + required=False, + default=[], + help="Collection IDs to export data from (space-separated list)", + ) + def handle(self, *args, **options): # for i, batch in enumerate(by_capture.get_data_in_batches()) # # print(f"Processing batch {batch}") # print(f"Processing batch {i}") + project_id: int = options["project_id"] + collection_ids: list[int] = options["collection_ids"] + + qs = all_captures.get_queryset().filter(project=project_id) + if collection_ids: + qs = qs.filter(collections__in=collection_ids) fname = write_export( "captures", Serializer=all_captures.CapturesTabularSerializer, - QuerySet=all_captures.get_queryset().filter(project=85).filter(collections__in=[82, 79]), - # .filter(collections__in=[82]), + QuerySet=qs, ) # get full path to the file print(f"Exported to {fname}") diff --git a/ami/exports/management/commands/export_sessions.py b/ami/exports/management/commands/export_sessions.py index 658180387..b3f33dba9 100644 --- a/ami/exports/management/commands/export_sessions.py +++ b/ami/exports/management/commands/export_sessions.py @@ -5,7 +5,7 @@ import logging -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandParser from ami.exports import all_sessions from ami.exports.base import write_export @@ -16,15 +16,24 @@ class Command(BaseCommand): help = "Export data by capture" + def add_arguments(self, parser: CommandParser) -> None: + parser.add_argument( + "--project-id", + type=int, + required=True, + help="Project ID to export data from", + ) + def handle(self, *args, **options): # for i, batch in enumerate(by_capture.get_data_in_batches()) # # print(f"Processing batch {batch}") # print(f"Processing batch {i}") + project_id: int = options["project_id"] fname = write_export( "sessions", Serializer=all_sessions.SessionsTabularSerializer, - QuerySet=all_sessions.get_queryset().filter(project=85), + QuerySet=all_sessions.get_queryset().filter(project=project_id), ) # get full path to the file print(f"Exported to {fname}")