Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
132f9dc
feat: Added pgvector extension
mohamedelabbas1996 Apr 14, 2025
dc48ccc
feat: Added features field to the Classification model
mohamedelabbas1996 Apr 14, 2025
8dc0c00
changed taxon and detection to autocomplete fields in the Classificat…
mohamedelabbas1996 Apr 16, 2025
4bf07b3
feat: added similar action to the ClassificationViewset
mohamedelabbas1996 Apr 16, 2025
b258c9b
chore: changed features vector field name to features_2048
mohamedelabbas1996 Apr 16, 2025
9490045
chore: changed features vector field name to features_2048
mohamedelabbas1996 Apr 16, 2025
0ff569f
feat: read features vector from processing service ClassificationResp…
mohamedelabbas1996 Apr 16, 2025
89a3b6c
test: added tests for PGVector distance metrics
mohamedelabbas1996 Apr 16, 2025
9e13cc4
updated docker-compose.ci.yml to use the same postgres image
mohamedelabbas1996 Apr 17, 2025
5a51593
updated docker-compose.ci.yml to use the same postgres image as docke…
mohamedelabbas1996 Apr 17, 2025
9efff5f
updated docker-compose.ci.yml to use the same postgres image as docke…
mohamedelabbas1996 Apr 17, 2025
1c66f34
feat: Added support for clustering detections for source image collec…
mohamedelabbas1996 Apr 29, 2025
99a7f3f
feat: Allowed triggering collection detections clustering from admin …
mohamedelabbas1996 Apr 29, 2025
83f2c08
fix: show unobserved Taxa in view for now
mihow Apr 29, 2025
5420f85
fix: create & update occurrence determinations after clustering
mihow Apr 29, 2025
6b0020d
feat: add unknown species filter to admin
mihow Apr 29, 2025
856035d
Merge branch 'deployments/ood.antenna.insectai.org' of github.com:Rol…
mihow Apr 30, 2025
036d81d
Merge branch 'deployments/ood.antenna.insectai.org' of github.com:Rol…
mihow Apr 30, 2025
4f8b09b
fix: circular import
mihow Apr 30, 2025
d255085
fix: update migration ordering
mihow Apr 30, 2025
2e12b56
Integrated Agglomerative clustering
mohamedelabbas1996 Apr 30, 2025
a301dc7
Merge branch 'feat/add-clustering' of https://github.com/RolnickLab/a…
mohamedelabbas1996 Apr 30, 2025
10820bb
updated clustering request params
mohamedelabbas1996 Apr 30, 2025
225529e
fixed Agglomerative clustering
mohamedelabbas1996 Apr 30, 2025
0423523
fix: disable missing clustering algorithms
mihow May 1, 2025
cb894f4
fix: syntax when creating algorithm entry
mihow May 1, 2025
39d9b6c
feat: command to create clustering job without starting it
mihow May 1, 2025
abd9cf1
feat: increase default batch size
mihow May 1, 2025
b2a7b3f
fix: better algorithm name
mihow May 1, 2025
bf67d06
feat: allow sorting by OOD score
mihow May 1, 2025
ce08f6a
Merge branch 'deployments/ood.antenna.insectai.org' of github.com:Rol…
mihow May 1, 2025
853b69d
feat: add unknown species and other fields to Taxon serializer
mihow May 1, 2025
6586872
fix: remove missing field
mihow May 1, 2025
b242079
fix: migration conflicts
mihow May 1, 2025
fe744f0
feat: fields for investigating occurrence classifications in admin
mihow May 2, 2025
4ac88da
fix: filter by feature extraction algorithm
mohamedelabbas1996 May 5, 2025
6d44bdb
chore: Used a serializer to handle job params instead of reading them…
mohamedelabbas1996 May 5, 2025
12b4ee4
set default ood threshold to 0.0
mohamedelabbas1996 May 5, 2025
2c73795
test: added tests for clustering
mohamedelabbas1996 May 5, 2025
e5d7ff0
chore: migration for new algorithm type
mihow May 6, 2025
9ad77f7
Merge branch 'deployments/ood.antenna.insectai.org' of github.com:Rol…
mihow May 6, 2025
fdbbf75
fix: remove cluster action in Event admin until it's ready
mihow May 6, 2025
0e92904
chore: move algorithm selection to dedicated function
mihow May 7, 2025
b26fbe0
fix: update clustering tests and types
mihow May 7, 2025
4032aff
chore: remove external network config in processing services
mihow May 7, 2025
0f8c544
feat: update GitHub workflows to run tests on other branches
mihow May 7, 2025
5fb5c43
fix: hide unobserved taxa by default
mihow May 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/test.backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ env:

on:
pull_request:
branches: ["master", "main"]
branches: ["main", "deployments/*", "releases/*"]
paths-ignore: ["docs/**", "ui/**"]

push:
branches: ["master", "main"]
branches: ["main", "deployments/*", "releases/*"]
paths-ignore: ["docs/**", "ui/**"]

concurrency:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test.frontend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ env:

on:
pull_request:
branches: ["master", "main"]
branches: ["main", "deployments/*", "releases/*"]
paths:
- "!./**"
- "ui/**"

push:
branches: ["master", "main"]
branches: ["main", "deployments/*", "releases/*"]
paths:
- "!./**"
- "ui/**"
Expand Down
29 changes: 29 additions & 0 deletions ami/jobs/migrations/0017_alter_job_job_type_key.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by Django 4.2.10 on 2025-04-24 16:25

from django.db import migrations, models


class Migration(migrations.Migration):
    # Auto-generated migration: Django re-declares the full `choices` list for
    # Job.job_type_key whenever a new job type is registered. This one adds
    # the "occurrence_clustering" choice (renamed in the following migration
    # 0018 to "detection_clustering").
    dependencies = [
        ("jobs", "0016_job_data_export_job_params_alter_job_job_type_key"),
    ]

    operations = [
        migrations.AlterField(
            model_name="job",
            name="job_type_key",
            field=models.CharField(
                choices=[
                    ("ml", "ML pipeline"),
                    ("populate_captures_collection", "Populate captures collection"),
                    ("data_storage_sync", "Data storage sync"),
                    ("unknown", "Unknown"),
                    ("data_export", "Data Export"),
                    ("occurrence_clustering", "Occurrence Feature Clustering"),
                ],
                default="unknown",
                max_length=255,
                verbose_name="Job Type",
            ),
        ),
    ]
29 changes: 29 additions & 0 deletions ami/jobs/migrations/0018_alter_job_job_type_key.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by Django 4.2.10 on 2025-04-28 11:06

from django.db import migrations, models


class Migration(migrations.Migration):
    # Auto-generated migration: renames the clustering job type choice from
    # "occurrence_clustering" (added in 0017) to "detection_clustering",
    # matching DetectionClusteringJob.key in ami/jobs/models.py.
    dependencies = [
        ("jobs", "0017_alter_job_job_type_key"),
    ]

    operations = [
        migrations.AlterField(
            model_name="job",
            name="job_type_key",
            field=models.CharField(
                choices=[
                    ("ml", "ML pipeline"),
                    ("populate_captures_collection", "Populate captures collection"),
                    ("data_storage_sync", "Data storage sync"),
                    ("unknown", "Unknown"),
                    ("data_export", "Data Export"),
                    ("detection_clustering", "Detection Feature Clustering"),
                ],
                default="unknown",
                max_length=255,
                verbose_name="Job Type",
            ),
        ),
    ]
44 changes: 42 additions & 2 deletions ami/jobs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,8 @@ def run(cls, job: "Job"):
total_classifications = 0

config = job.pipeline.get_config(project_id=job.project.pk)
chunk_size = config.get("request_source_image_batch_size", 1)
chunk_size = config.get("request_source_image_batch_size", 2)
# @TODO Ensure only images of the same dimensions are processed in a batch
chunks = [images[i : i + chunk_size] for i in range(0, image_count, chunk_size)] # noqa
request_failed_images = []

Expand Down Expand Up @@ -639,6 +640,38 @@ def run(cls, job: "Job"):
job.update_status(JobState.SUCCESS, save=True)


class DetectionClusteringJob(JobType):
    """Job type that clusters detection feature vectors for a source image collection."""

    name = "Detection Feature Clustering"
    key = "detection_clustering"

    @classmethod
    def run(cls, job: "Job"):
        """Execute the clustering job.

        Marks the job as started, registers the progress stages, delegates the
        clustering work to the source image collection, then marks the job as
        finished.

        Raises:
            ValueError: If the job has no source image collection to cluster.
        """
        # Validate before mutating any job state so a misconfigured job is not
        # left persisted in a STARTED state.
        if not job.source_image_collection:
            raise ValueError("No source image collection provided")

        # Mark the job started exactly once (the original did this twice,
        # duplicating update_status/started_at/finished_at/save).
        job.update_status(JobState.STARTED)
        job.started_at = datetime.datetime.now()
        job.finished_at = None
        job.progress.add_stage(name="Collecting Features", key="feature_collection")
        job.progress.add_stage("Clustering", key="clustering")
        job.progress.add_stage("Creating Unknown Taxa", key="create_unknown_taxa")
        job.save()

        job.logger.info(f"Clustering detections for collection {job.source_image_collection}")

        # Delegate the actual clustering work to the collection model.
        job.source_image_collection.cluster_detections(job=job)
        job.logger.info(f"Finished clustering detections for collection {job.source_image_collection}")

        job.finished_at = datetime.datetime.now()
        # Defer the save so the final status and finished_at persist together.
        job.update_status(JobState.SUCCESS, save=False)
        job.save()


class UnknownJobType(JobType):
name = "Unknown"
key = "unknown"
Expand All @@ -648,7 +681,14 @@ def run(cls, job: "Job"):
raise ValueError(f"Unknown job type '{job.job_type()}'")


VALID_JOB_TYPES = [MLJob, SourceImageCollectionPopulateJob, DataStorageSyncJob, UnknownJobType, DataExportJob]
# Registry of all job types resolvable by `get_job_type_by_key`; each entry's
# `key` must match a choice declared for Job.job_type_key.
VALID_JOB_TYPES = [
    MLJob,
    SourceImageCollectionPopulateJob,
    DataStorageSyncJob,
    UnknownJobType,
    DataExportJob,
    DetectionClusteringJob,  # added for the "detection_clustering" job type
]


def get_job_type_by_key(key: str) -> type[JobType] | None:
Expand Down
61 changes: 58 additions & 3 deletions ami/main/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from django.http.request import HttpRequest
from django.template.defaultfilters import filesizeformat
from django.utils.formats import number_format
from django.utils.html import format_html
from guardian.admin import GuardedModelAdmin

import ami.utils
Expand Down Expand Up @@ -220,7 +221,6 @@ def update_calculated_fields(self, request: HttpRequest, queryset: QuerySet[Even
self.message_user(request, f"Updated {queryset.count()} events.")

list_filter = ("deployment", "project", "start")
actions = [update_calculated_fields]


@admin.register(SourceImage)
Expand Down Expand Up @@ -262,20 +262,27 @@ class ClassificationInline(admin.TabularInline):
model = Classification
extra = 0
fields = (
"view_classification",
"taxon",
"algorithm",
"timestamp",
"terminal",
"created_at",
)
readonly_fields = (
"view_classification",
"taxon",
"algorithm",
"timestamp",
"terminal",
"created_at",
)

@admin.display(description="Classification")
def view_classification(self, obj):
    """Render the row's primary key as a link to the Classification change page."""
    return format_html(
        '<a href="{}">{}</a>',
        f"/admin/main/classification/{obj.pk}/change/",
        obj.pk,
    )

def get_queryset(self, request: HttpRequest) -> QuerySet[Any]:
qs = super().get_queryset(request)
return qs.select_related("taxon", "algorithm", "detection")
Expand All @@ -285,20 +292,27 @@ class DetectionInline(admin.TabularInline):
model = Detection
extra = 0
fields = (
"view_detection",
"detection_algorithm",
"source_image",
"timestamp",
"created_at",
"occurrence",
)
readonly_fields = (
"view_detection",
"detection_algorithm",
"source_image",
"timestamp",
"created_at",
"occurrence",
)

@admin.display(description="Detection")
def view_detection(self, obj):
    """Render the row's primary key as a link to the Detection change page."""
    return format_html(
        '<a href="{}">{}</a>',
        f"/admin/main/detection/{obj.pk}/change/",
        obj.pk,
    )


@admin.register(Detection)
class DetectionAdmin(admin.ModelAdmin[Detection]):
Expand Down Expand Up @@ -461,7 +475,7 @@ class TaxonAdmin(admin.ModelAdmin[Taxon]):
"created_at",
"updated_at",
)
list_filter = ("lists", "rank", TaxonParentFilter)
list_filter = ("unknown_species", "lists", "rank", TaxonParentFilter)
search_fields = ("name",)
autocomplete_fields = (
"parent",
Expand Down Expand Up @@ -594,7 +608,48 @@ def populate_collection_async(self, request: HttpRequest, queryset: QuerySet[Sou
f"Populating {len(queued_tasks)} collection(s) background tasks: {queued_tasks}.",
)

actions = [populate_collection, populate_collection_async]
@admin.action(description="Create clustering job (but don't run it)")
def create_clustering_job(self, request: HttpRequest, queryset: QuerySet[SourceImageCollection]) -> None:
    """Create a DetectionClusteringJob for each selected collection without enqueueing it.

    The job can later be started manually (e.g. from the Job admin).
    """
    # NOTE: the original stacked a second bare @admin.action() decorator on top
    # of the descriptive one; the duplicate wrap discarded the description and
    # has been removed.
    # Local import to avoid a circular import between admin and jobs models.
    from ami.jobs.models import DetectionClusteringJob, Job

    for collection in queryset:
        job = Job.objects.create(
            name=f"Clustering detections for collection {collection.pk}",
            project=collection.project,
            source_image_collection=collection,
            job_type_key=DetectionClusteringJob.key,
            # Default clustering parameters; tune per dataset as needed.
            params={
                "ood_threshold": 0.3,
                "algorithm": "agglomerative",
                "algorithm_kwargs": {"distance_threshold": 80},
                "pca": {"n_components": 384},
            },
        )
        self.message_user(request, f"Created clustering job #{job.pk} for collection #{collection.pk}")

@admin.action()
def cluster_detections(self, request: HttpRequest, queryset: QuerySet[SourceImageCollection]) -> None:
    """Create and immediately enqueue a DetectionClusteringJob for each selected collection."""
    # Hoisted out of the loop: the original re-imported on every iteration.
    # Local import to avoid a circular import between admin and jobs models.
    from ami.jobs.models import DetectionClusteringJob, Job

    for collection in queryset:
        job = Job.objects.create(
            name=f"Clustering detections for collection {collection.pk}",
            project=collection.project,
            source_image_collection=collection,
            job_type_key=DetectionClusteringJob.key,
            # Default clustering parameters; tune per dataset as needed.
            params={
                "ood_threshold": 0.3,
                "algorithm": "agglomerative",
                "algorithm_kwargs": {"distance_threshold": 80},
                "pca": {"n_components": 384},
            },
        )
        job.enqueue()

    self.message_user(request, f"Clustered {queryset.count()} collection(s).")

actions = [populate_collection, populate_collection_async, cluster_detections, create_clustering_job]

# Hide images many-to-many field from form. This would list all source images in the database.
exclude = ("images",)
11 changes: 11 additions & 0 deletions ami/main/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,7 @@ class Meta:
"last_detected",
"best_determination_score",
"cover_image_url",
"unknown_species",
"created_at",
"updated_at",
]
Expand Down Expand Up @@ -740,6 +741,8 @@ class Meta:
"fieldguide_id",
"cover_image_url",
"cover_image_credit",
"unknown_species",
"last_detected", # @TODO this has performance impact, review
]


Expand Down Expand Up @@ -1548,3 +1551,11 @@ class Meta:
"total_size",
"last_checked",
]


class ClusterDetectionsSerializer(serializers.Serializer):
    """Validates the parameters for a detection clustering job request.

    Used both by the SourceImageCollection `cluster_detections` API action and
    as the schema for DetectionClusteringJob params.
    """

    ood_threshold = serializers.FloatField(required=False, default=0.0)
    feature_extraction_algorithm = serializers.CharField(required=False, allow_null=True)
    algorithm = serializers.CharField(required=False, default="agglomerative")
    # Use callables for mutable defaults: DRF calls a callable default per use,
    # so a single shared dict instance cannot be mutated across requests.
    algorithm_kwargs = serializers.DictField(required=False, default=lambda: {"distance_threshold": 0.5})
    pca = serializers.DictField(required=False, default=lambda: {"n_components": 384})
26 changes: 24 additions & 2 deletions ami/main/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
)
from ami.base.serializers import FilterParamsSerializer, SingleParamSerializer
from ami.base.views import ProjectMixin
from ami.jobs.models import DetectionClusteringJob, Job
from ami.main.api.serializers import ClusterDetectionsSerializer
from ami.utils.requests import get_active_classification_threshold, project_id_doc_param
from ami.utils.storages import ConnectionTestResult

Expand Down Expand Up @@ -744,6 +746,27 @@ def remove(self, request, pk=None):
}
)

@action(detail=True, methods=["post"], name="cluster detections")
def cluster_detections(self, request, pk=None):
    """Kick off a background job that clusters the detections in this collection."""
    collection: SourceImageCollection = self.get_object()

    # Validate the clustering parameters supplied in the request body.
    params_serializer = ClusterDetectionsSerializer(data=request.data)
    params_serializer.is_valid(raise_exception=True)

    clustering_job = Job.objects.create(
        name=f"Clustering detections for collection {collection.pk}",
        project=collection.project,
        source_image_collection=collection,
        job_type_key=DetectionClusteringJob.key,
        params=params_serializer.validated_data,
    )
    clustering_job.enqueue()

    logger.info(f"Triggered clustering job for collection {collection.pk}")
    return Response({"job_id": clustering_job.pk, "project_id": collection.project.pk})

@extend_schema(parameters=[project_id_doc_param])
def list(self, request, *args, **kwargs):
return super().list(request, *args, **kwargs)
Expand Down Expand Up @@ -1273,8 +1296,7 @@ def get_queryset(self) -> QuerySet:
project = self.get_active_project()

if project:
# Allow showing detail views for unobserved taxa
include_unobserved = True
include_unobserved = True # Show detail views for unobserved taxa instead of 404
if self.action == "list":
include_unobserved = self.request.query_params.get("include_unobserved", False)
qs = self.get_taxa_observed(qs, project, include_unobserved=include_unobserved)
Expand Down
17 changes: 17 additions & 0 deletions ami/main/migrations/0063_taxon_unknown_species.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 4.2.10 on 2025-04-28 11:11

from django.db import migrations, models


class Migration(migrations.Migration):
    # Auto-generated migration: adds the boolean `unknown_species` flag to
    # Taxon, used to mark taxa generated by the detection clustering feature.
    dependencies = [
        ("main", "0062_classification_ood_score_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="taxon",
            name="unknown_species",
            field=models.BooleanField(default=False, help_text="Is this a clustering-generated taxon"),
        ),
    ]
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class Migration(migrations.Migration):
dependencies = [
("main", "0059_alter_project_options"),
("main", "0063_taxon_unknown_species"),
]

operations = [
Expand Down
Loading