Skip to content

Commit 6f6fdb9

Browse files
authored
Merge branch 'main' into feature/coassembly-schema
2 parents e99cf68 + 48c6069 commit 6f6fdb9

File tree

7 files changed

+400
-131
lines changed

7 files changed

+400
-131
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,5 @@ genomes/temp/*
3030
slurm-dev-environment/fs/nfs/public/tests/assembly_v6_output/ERP106708/MGYS*
3131
slurm-dev-environment/fs/nfs/public/tests/amplicon_v6_output/dwca/*
3232
.pytest-cache/
33+
.claude/*
34+
CLAUDE.md

.talismanrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ fileignoreconfig:
172172
checksum: ac644b851d31913ca9328d5a83516146c4c76874201df0a2d7086df6b60435c8
173173

174174
- filename: workflows/tests/test_analysis_rawreads_study_flow.py
175-
checksum: 0f7255bdd7303269337a7d9be17d5246bfc32980697ba4135f54be201d13180a
175+
checksum: 211d4cfcd3e1b0dbe6e8b92683f6160661c50a17429d470ea0651daa41f5c3f8
176176

177177
- filename: workflows/flows/upload_assembly.py
178178
allowed_patterns: [key]

deployment/ebi-wp-k8s-hl/ebi-wp-k8s-hl.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,7 @@ items:
470470
periodSeconds: 600
471471
initContainers:
472472
- name: webclient-build
473-
image: quay.io/microbiome-informatics/node:22
473+
image: quay.io/microbiome-informatics/ebi-metagenomics-client@sha256:3219c806f92dd0d5a791648a1fc95abe5bb7784660c2a7c042cbc6916a8c05ff
474474
imagePullPolicy: IfNotPresent
475475
resources:
476476
requests:

workflows/flows/analyse_study_tasks/raw_reads/run_rawreads_pipeline_via_samplesheet.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def run_rawreads_pipeline_via_samplesheet(
4444
rawreads_analysis_ids: List[Union[str, int]],
4545
workdir: Optional[Path],
4646
outdir: Optional[Path],
47+
functional_analysis: bool = False,
4748
):
4849
if workdir is None:
4950
workdir = (
@@ -94,6 +95,7 @@ def run_rawreads_pipeline_via_samplesheet(
9495
("--outdir", nextflow_outdir),
9596
EMG_CONFIG.slurm.use_nextflow_tower and "-with-tower",
9697
EMG_CONFIG.rawreads_pipeline.has_fire_access and "--use_fire_download",
98+
("--skip_functional", "false" if functional_analysis else "true"),
9799
("-work-dir", nextflow_workdir),
98100
("-ansi-log", "false"),
99101
]

workflows/flows/analysis_rawreads_study.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ class AnalyseStudyInput(RunInput):
103103
ENALibraryStrategyPolicy.ONLY_IF_CORRECT_IN_ENA,
104104
description="Optionally treat read-runs with incorrect library strategy metadata as raw-reads.",
105105
)
106+
functional_analysis: bool = Field(
107+
False,
108+
description="Enable functional analysis in the raw-reads pipeline.",
109+
)
106110
webin_owner: Optional[str] = Field(
107111
None,
108112
description="Webin ID of study owner, if data is private. Can be left as None, if public.",
@@ -208,7 +212,11 @@ class AnalyseStudyInput(RunInput):
208212
f"Working on raw-reads analyses: {analyses_chunk[0]}-{analyses_chunk[-1]}"
209213
)
210214
run_rawreads_pipeline_via_samplesheet(
211-
mgnify_study, analyses_chunk, study_workdir, study_outdir
215+
mgnify_study,
216+
analyses_chunk,
217+
study_workdir,
218+
study_outdir,
219+
functional_analysis=analyse_study_input.functional_analysis,
212220
)
213221

214222
merge_study_summaries(

workflows/prefect_deployments/prefect-ebi-codon.yaml

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,15 @@ deployments:
2222
job_variables: {}
2323
schedules: []
2424

25+
- name: analysis_amplicon_study_deployment
26+
description: |-
27+
Get a study from ENA, and input it to MGnify.
28+
Kick off amplicon-v6 pipeline.
29+
:param study_accession: Study accession e.g. PRJxxxxxx
30+
entrypoint: workflows/flows/analysis_amplicon_study.py:analysis_amplicon_study
31+
work_pool:
32+
name: slurm
33+
2534
- name: analysis_assembly_study_deployment
2635
description: |-
2736
Get a study from ENA (or MGnify), and run assembly analysis on the assemblies of the study.
@@ -177,3 +186,107 @@ deployments:
177186
entrypoint: workflows/flows/nf_traces/flows.py:nextflow_trace_etl_flow
178187
work_pool:
179188
name: slurm
189+
190+
- name: analysis_rawreads_study_deployment
191+
description: |-
192+
Get a study from ENA, and input it to MGnify.
193+
Kick off raw-reads-v6 pipeline.
194+
:param study_accession: Study accession e.g. PRJxxxxxx
195+
entrypoint: workflows/flows/analysis_rawreads_study.py:analysis_rawreads_study
196+
work_pool:
197+
name: slurm
198+
199+
- name: import_genomes_flow_deployment
200+
description: |-
201+
Imports genomes from a catalogue directory into the database.
202+
203+
This flow processes genome results from a catalogue directory, performs sanity checks,
204+
and imports genome data including annotations, files, and metadata.
205+
206+
:param results_directory: Path to the catalogue directory containing genome results
207+
:param catalogue_name: Name of the genome catalogue
208+
:param catalogue_version: Version of the genome catalogue
209+
:param gold_biome: Biome classification for the catalogue
210+
:param pipeline_version: Version of the pipeline used to generate the genomes
211+
:param catalogue_type: Type of catalogue (e.g., prokaryotes, eukaryotes)
212+
:param catalogue_biome_label: Optional label for the catalogue biome
213+
entrypoint: workflows/flows/import_genomes_flow.py:import_genomes_flow
214+
work_pool:
215+
name: slurm
216+
217+
- name: import_genome_assembly_links_flow_deployment
218+
description: |-
219+
Imports data from a TSV file into the GenomeAssemblyLink model.
220+
221+
This flow processes a TSV file containing genome assembly link information and
222+
imports it into the database, creating relationships between MAGs, genomes, and species representatives.
223+
224+
:param tsv_path: Path to the TSV file containing genome assembly link data
225+
entrypoint: workflows/flows/import_genome_assembly_links_flow.py:import_genome_assembly_links_flow
226+
work_pool:
227+
name: slurm
228+
229+
- name: import_additional_contained_genomes_flow_deployment
230+
description: |-
231+
Imports data from a large TSV file into the AdditionalContainedGenomes model.
232+
233+
The TSV must contain the following columns:
234+
- Run
235+
- Genome_Mgnify_accession
236+
- Containment
237+
- cANI
238+
239+
The flow reads the file in streaming chunks and performs batched DB operations.
240+
241+
:param csv_path: Path to the TSV file containing additional contained genomes data
242+
:param chunk_size: Size of chunks to read from the file (default: 50000)
243+
:param insert_batch_size: Size of batches for database insertion (default: 10000)
244+
entrypoint: workflows/flows/import_additional_contained_genomes_flow.py:import_additional_contained_genomes_flow
245+
work_pool:
246+
name: slurm
247+
248+
- name: update_ena_accession_from_json_flow_deployment
249+
description: |-
250+
Traverse per-genome JSON files to update Genome.ena_genome_accession from the
251+
'ncbi_genome_accession' value found in each file.
252+
253+
:param base_dir: Directory containing one subdirectory per genome accession, each with
254+
a JSON file named <accession>.json
255+
:param read_chunk_size: Django iterator chunk size when scanning genomes (default: 5000)
256+
:param update_batch_size: Number of rows to bulk update at once (default: 2000)
257+
:param catalogue_name: Optional; if provided, restrict processing to genomes whose
258+
catalogue has this exact name
259+
entrypoint: workflows/flows/update_ena_accession_from_json_flow.py:update_ena_accession_from_json_flow
260+
work_pool:
261+
name: slurm
262+
263+
- name: upload_assembly_deployment
264+
description: |-
265+
This flow performs a sanity check and uploads an assembly for a specific run to ENA.
266+
267+
It is intended to be executed *per run* after the assembly flow. The assembly uploader
268+
is a separate Python library that prepares the upload files. The assembly submission
269+
via `webin-cli` is launched as a SLURM cluster job.
270+
271+
:param assembly_id: ID of the assembly to upload
272+
:param dry_run: If True, perform a dry run without actual upload (default: True)
273+
:param custom_upload_folder: Optional custom path for upload folder
274+
entrypoint: workflows/flows/upload_assembly.py:upload_assembly
275+
work_pool:
276+
name: slurm
277+
278+
- name: move_data_deployment
279+
description: |-
280+
Move files on the cluster filesystem.
281+
282+
This flow uses a slurm job running on the datamover partition to move files
283+
between locations on the cluster filesystem.
284+
285+
:param source: fully qualified path of the source location (file or folder)
286+
:param target: fully qualified path of the target location (file or folder)
287+
:param move_command: tool command for the move. Default is `cp`, but could be `mv` or `rsync` etc.
288+
:param make_target: mkdir the target location path before copying.
289+
:param kwargs: Other keywords to pass to run_cluster_job
290+
entrypoint: workflows.prefect_utils.datamovers:move_data
291+
work_pool:
292+
name: slurm

0 commit comments

Comments
 (0)