import legacy co-assemblies

SandyRogers · SandyRogers · commit f3d72ed115b1 · 2026-03-02T11:49:15.000Z
diff --git a/workflows/flows/legacy/flows/import_v5_analyses.py b/workflows/flows/legacy/flows/import_v5_analyses.py
@@ -9,6 +9,9 @@
     sync_study_metadata_from_ena,
     sync_sample_metadata_from_ena,
 )
+from workflows.flows.legacy.tasks.make_assembly_from_legacy_emg_db import (
+    make_assembly_from_legacy_emg_db,
+)
 from workflows.flows.legacy.tasks.make_run_from_legacy_emg_db import (
     make_run_from_legacy_emg_db,
 )
@@ -71,6 +74,15 @@ def import_v5_analyses(mgys: str):
             sync_sample_metadata_from_ena(sample.ena_sample)
             run = make_run_from_legacy_emg_db(legacy_analysis.run, study)
 
+            assembly = None
+            if legacy_analysis.experiment_type_id in (4, 7, 8):  # Assembly types
+                assembly = make_assembly_from_legacy_emg_db(
+                    legacy_analysis.secondary_accession,
+                    legacy_analysis.result_directory,
+                    study,
+                    sample,
+                )
+
             analysis, created = Analysis.objects.update_or_create(
                 id=legacy_analysis.job_id,
                 defaults={
@@ -80,6 +92,7 @@ def import_v5_analyses(mgys: str):
                     "ena_study": study.ena_study,
                     "pipeline_version": Analysis.PipelineVersions.v5,
                     "run": run,
+                    "assembly": assembly,
                 },
             )
             analysis.inherit_experiment_type()
diff --git a/workflows/flows/legacy/tasks/make_assembly_from_legacy_emg_db.py b/workflows/flows/legacy/tasks/make_assembly_from_legacy_emg_db.py
@@ -0,0 +1,108 @@
+from prefect import task, get_run_logger
+from sqlalchemy import select
+
+from analyses.models import Study, Sample, Assembly
+from workflows.data_io_utils.legacy_emg_dbs import (
+    LegacyAssembly,
+    LegacySample,
+    LegacyAssemblySample,
+    LegacyRun,
+    LegacyAssemblyRun,
+)
+from workflows.ena_utils.ena_api_requests import sync_sample_metadata_from_ena
+from workflows.flows.legacy.tasks.make_run_from_legacy_emg_db import (
+    make_run_from_legacy_emg_db,
+)
+from workflows.flows.legacy.tasks.make_sample_from_legacy_emg_db import (
+    make_sample_from_legacy_emg_db,
+)
+
+
+@task
+def make_assembly_from_legacy_emg_db(
+    legacy_analysis_secondary_accession: str,
+    legacy_analysis_result_directory: str,
+    study: Study,
+    sample: Sample,
+) -> Assembly | None:
+    """
+    Get or create the Assembly for a legacy analysis job.
+
+    Looks up the legacy assembly whose accession equals the analysis job's
+    secondary accession. If there is one, an Assembly is fetched or created
+    for the given study, result directory and sample; every legacy sample
+    linked to the legacy assembly is made and synced from ENA; and every
+    linked legacy run is made and added to the Assembly's runs.
+
+    :param legacy_analysis_secondary_accession: Secondary accession of the
+        legacy analysis job, matched against LegacyAssembly.accession.
+    :param legacy_analysis_result_directory: Result directory of the legacy
+        analysis job, used as the Assembly's dir.
+    :param study: Study providing the Assembly's ena_study (and, on creation,
+        its reads_study); also passed on when making samples and runs.
+    :param sample: Sample set on the Assembly itself.
+    :return: The Assembly, or None if no legacy assembly has that accession.
+    """
+    from workflows.data_io_utils.legacy_emg_dbs import legacy_emg_db_session
+
+    logger = get_run_logger()
+
+    with legacy_emg_db_session() as session:
+        # Try to find a co-assembly/assembly in the legacy DB
+        legacy_assembly_stmt = select(LegacyAssembly).where(
+            LegacyAssembly.accession == legacy_analysis_secondary_accession
+        )
+        legacy_assembly: LegacyAssembly | None = session.scalar(legacy_assembly_stmt)
+
+        if not legacy_assembly:
+            # The caller only sees None, so record the miss in the logs.
+            logger.warning(
+                f"No legacy assembly found for {legacy_analysis_secondary_accession}"
+            )
+            return None
+
+        # In this new schema, Assembly has a sample field and runs ManyToMany.
+        # A co-assembly might have multiple runs and samples.
+        # For now we use the analysis job's primary sample for the assembly.
+        assembly, created = Assembly.objects.get_or_create(
+            ena_study=study.ena_study,
+            dir=legacy_analysis_result_directory,
+            sample=sample,
+            defaults={
+                "ena_accessions": [legacy_assembly.accession],
+                "reads_study": study,
+            },
+        )
+        if created:
+            logger.info(f"Created new Assembly object {assembly}")
+
+        # Ensure all samples linked to this assembly exist in the new DB
+        # before creating the runs.
+        legacy_samples_stmt = (
+            select(LegacySample)
+            .join(
+                LegacyAssemblySample,
+                LegacySample.sample_id == LegacyAssemblySample.sample_id,
+            )
+            .where(LegacyAssemblySample.assembly_id == legacy_assembly.assembly_id)
+        )
+        legacy_samples = session.scalars(legacy_samples_stmt).unique().all()
+        for leg_sample in legacy_samples:
+            s = make_sample_from_legacy_emg_db(leg_sample, study)
+            sync_sample_metadata_from_ena(s.ena_sample)
+
+        # Link all runs associated with this legacy assembly
+        legacy_runs_stmt = (
+            select(LegacyRun)
+            .join(
+                LegacyAssemblyRun,
+                LegacyRun.run_id == LegacyAssemblyRun.run_id,
+            )
+            .where(LegacyAssemblyRun.assembly_id == legacy_assembly.assembly_id)
+        )
+        legacy_runs = session.scalars(legacy_runs_stmt).unique().all()
+        for leg_run in legacy_runs:
+            r = make_run_from_legacy_emg_db(leg_run, study)
+            assembly.runs.add(r)
+
+    return assembly
diff --git a/workflows/tests/test_coassembly_import.py b/workflows/tests/test_coassembly_import.py
@@ -0,0 +1,163 @@
+import re
+
+import pytest
+
+from analyses.models import Analysis, Run
+from workflows.data_io_utils.legacy_emg_dbs import (
+    LegacyStudy,
+    LegacySample,
+    LegacyRun,
+    LegacyAnalysisJob,
+    LegacyAssembly,
+    LegacyAssemblyRun,
+    LegacyAssemblySample,
+)
+from workflows.flows.legacy.flows.import_v5_analyses import import_v5_analyses
+from workflows.prefect_utils.testing_utils import (
+    run_flow_and_capture_logs,
+    should_not_mock_httpx_requests_to_prefect_server,
+)
+
+
+@pytest.fixture
+def coassembly_legacy_db(in_memory_legacy_emg_db):
+    with in_memory_legacy_emg_db as session:
+        # Add a co-assembly study (ID 6000)
+        study = LegacyStudy(
+            id=6000,
+            centre_name="COASSEMBLY",
+            study_name="Co-assembly study",
+            ext_study_id="ERP6000",
+            is_private=False,
+            submission_account_id="Webin-6000",
+            project_id="PRJ6000",
+            is_suppressed=False,
+            biome_id=1,
+        )
+        session.add(study)
+
+        # Two samples, one for each of the runs below
+        sample1 = LegacySample(
+            sample_id=6001, ext_sample_id="ERS6001", primary_accession="SAMEA6001"
+        )
+        sample2 = LegacySample(
+            sample_id=6002, ext_sample_id="ERS6002", primary_accession="SAMEA6002"
+        )
+        session.add_all([sample1, sample2])
+
+        # Two runs in study 6000, each on its own sample
+        run1 = LegacyRun(
+            run_id=6001,
+            sample_id=6001,
+            accession="ERR6001",
+            experiment_type_id=4,
+            secondary_accession="ERR6001",
+            study_id=6000,
+            instrument_platform="Illumina",
+            instrument_model="HiSeq",
+        )
+        run2 = LegacyRun(
+            run_id=6002,
+            sample_id=6002,
+            accession="ERR6002",
+            experiment_type_id=4,
+            secondary_accession="ERR6002",
+            study_id=6000,
+            instrument_platform="Illumina",
+            instrument_model="HiSeq",
+        )
+        session.add_all([run1, run2])
+
+        # One assembly (ERZ6001) in the same study
+        assembly = LegacyAssembly(
+            assembly_id=6001, accession="ERZ6001", study_id=6000, experiment_type_id=4
+        )
+        session.add(assembly)
+
+        # Link both runs and both samples to the assembly
+        session.add(
+            LegacyAssemblyRun(assembly_run_id=6001, assembly_id=6001, run_id=6001)
+        )
+        session.add(
+            LegacyAssemblyRun(assembly_run_id=6002, assembly_id=6001, run_id=6002)
+        )
+        session.add(
+            LegacyAssemblySample(
+                assembly_sample_id=6001, assembly_id=6001, sample_id=6001
+            )
+        )
+        session.add(
+            LegacyAssemblySample(
+                assembly_sample_id=6002, assembly_id=6001, sample_id=6002
+            )
+        )
+
+        # Analysis job for the co-assembly; secondary_accession is the assembly's ERZ
+        analysis = LegacyAnalysisJob(
+            job_id=66666,
+            sample_id=6001,
+            run_id=6001,
+            study_id=6000,
+            pipeline_id=6,
+            result_directory="coassembly/results",
+            external_run_ids="ERR6001,ERR6002",
+            secondary_accession="ERZ6001",
+            experiment_type_id=4,
+            analysis_status_id=3,
+        )
+        session.add(analysis)
+        session.commit()
+    return in_memory_legacy_emg_db
+
+
+@pytest.mark.httpx_mock(should_mock=should_not_mock_httpx_requests_to_prefect_server)
+@pytest.mark.django_db(transaction=True)
+def test_import_coassembly(
+    prefect_harness,
+    coassembly_legacy_db,
+    mock_legacy_emg_db_session,
+    mock_mongo_client_for_taxonomy_and_protein_functions,
+    httpx_mock,
+):
+    httpx_mock.add_response(
+        url=re.compile(r".*result=study.*ERP6000.*"),
+        json=[{"study_accession": "ERP6000"}],
+        is_reusable=True,
+    )
+    # Mock ENA sample metadata for both samples of the co-assembly
+    httpx_mock.add_response(
+        url=re.compile(r".*result=sample.*SAMEA6001.*"),
+        json=[{"sample_accession": "SAMEA6001"}],
+        is_reusable=True,
+    )
+    httpx_mock.add_response(
+        url=re.compile(r".*result=sample.*SAMEA6002.*"),
+        json=[{"sample_accession": "SAMEA6002"}],
+        is_reusable=True,
+    )
+
+    run_flow_and_capture_logs(
+        import_v5_analyses,
+        mgys="MGYS00006000",
+    )
+
+    analysis = Analysis.objects.get(id=66666)
+    assert analysis.assembly is not None
+    # The legacy ERZ accession should be stored in ena_accessions
+    assert "ERZ6001" in analysis.assembly.ena_accessions
+
+    # Check that the assembly is linked to both runs
+    runs = analysis.assembly.runs.all()
+    assert runs.count() == 2
+    run_accessions = {r.first_accession for r in runs}
+    assert run_accessions == {"ERR6001", "ERR6002"}
+
+    # Check that the assembly is linked to the correct sample.
+    # At the moment we expect only the first sample, as assemblies are transitioning to single virtual samples
+    assert analysis.assembly.sample.first_accession == "SAMEA6001"
+    assert not analysis.assembly.sample.related_samples.exists()
+    # The other sample should still be linked via its run though
+    assert (
+        Run.objects.get(ena_accessions__contains=["ERR6002"]).sample.first_accession
+        == "SAMEA6002"
+    )