Skip to content

Commit 7bc8747

Browse files
authored
Merge pull request #1065 from hubmapconsortium/jpuerto/workflow-description-updates
Jpuerto/workflow description updates
2 parents 12523bd + 0930bf7 commit 7bc8747

File tree

7 files changed

+49
-68
lines changed

7 files changed

+49
-68
lines changed

src/ingest-pipeline/airflow/dags/bulk_atacseq.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@
6161
) as dag:
6262
pipeline_name = "bulk-atac-seq"
6363
workflow_version = "1.0.0"
64-
workflow_description = "The bulk ATAC seq pipeline performs short read alignment to the HG38 reference genome using HISAT-2, and then calls peaks on the resulting BAM file using MACS2."
64+
workflow_description = "The bulk ATAC seq pipeline performs short read alignment to the hg38 reference genome using HISAT-2, and then calls peaks on the resulting BAM file using MACS2."
6565

6666
cwl_workflows = [
6767
{
@@ -195,13 +195,11 @@ def build_cwltool_cmd1(**kwargs):
195195
(
196196
t_log_info
197197
>> t_create_tmpdir
198-
199198
>> prepare_cwl1
200199
>> t_build_cmd1
201200
>> t_pipeline_exec
202201
>> t_maybe_keep_cwl1
203202
>> t_maybe_create_dataset
204-
205203
>> t_send_create_dataset
206204
>> t_move_data
207205
>> t_send_status

src/ingest-pipeline/airflow/dags/codex_cytokit.py

Lines changed: 16 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@
6262
) as dag:
6363
pipeline_name = "codex-pipeline"
6464
workflow_version = "1.0.0"
65-
workflow_description = "The CODEX pipeline performs illumination correction and other pre-processing steps, segments nuclei and cells using Cytokit, and performs spatial analysis of expression data using SPRM, which computes various measures of analyte intensity per cell, performs clustering based on expression and other data, and computes markers for each cluster"
65+
workflow_description = "The CODEX pipeline performs illumination correction and other pre-processing steps, segments nuclei and cells using Cytokit, and performs spatial analysis of expression data using SPRM, which computes various measures of analyte intensity per cell, performs clustering based on expression and other data, and computes markers for each cluster."
6666
steps_dir = Path(pipeline_name) / "steps"
6767

6868
cwl_workflows = [
@@ -83,9 +83,7 @@
8383
"documentation_url": "",
8484
},
8585
{
86-
"workflow_path": str(
87-
get_absolute_workflow(Path("ribca", "pipeline.cwl"))
88-
),
86+
"workflow_path": str(get_absolute_workflow(Path("ribca", "pipeline.cwl"))),
8987
"documentation_url": "",
9088
},
9189
{
@@ -264,9 +262,14 @@ def build_cwltool_cwl_ometiff_second_stitching(**kwargs):
264262
input_parameters = [
265263
{"parameter_name": "--cytokit_config", "value": str(data_dir / "experiment.yaml")},
266264
{"parameter_name": "--cytokit_output", "value": str(data_dir / "cytokit")},
267-
{"parameter_name": "--slicing_pipeline_config",
268-
"value": str(data_dir / "pipelineConfig.json"), },
269-
{"parameter_name": "--num_concurrent_tasks", "value": get_threads_resource(dag.dag_id)},
265+
{
266+
"parameter_name": "--slicing_pipeline_config",
267+
"value": str(data_dir / "pipelineConfig.json"),
268+
},
269+
{
270+
"parameter_name": "--num_concurrent_tasks",
271+
"value": get_threads_resource(dag.dag_id),
272+
},
270273
{"parameter_name": "--data_dir", "value": str(get_parent_data_dir(**kwargs))},
271274
]
272275
command = get_cwl_cmd_from_workflows(workflows, 2, input_parameters, tmpdir, kwargs["ti"])
@@ -322,9 +325,7 @@ def build_cwltool_cwl_ribca(**kwargs):
322325
key="cwl_workflows", task_ids="build_cwl_ometiff_second_stitching"
323326
)
324327

325-
input_parameters = [
326-
{"parameter_name": "--data_dir", "value": str(data_dir)}
327-
]
328+
input_parameters = [{"parameter_name": "--data_dir", "value": str(data_dir)}]
328329
command = get_cwl_cmd_from_workflows(workflows, 3, input_parameters, tmpdir, kwargs["ti"])
329330

330331
return join_quote_command_str(command)
@@ -383,9 +384,7 @@ def build_cwltool_cmd_deepcelltypes(**kwargs):
383384
data_dir = tmpdir / "cwl_out"
384385
print("data_dir: ", data_dir)
385386

386-
workflows = kwargs["ti"].xcom_pull(
387-
key="cwl_workflows", task_ids="build_cwl_ribca"
388-
)
387+
workflows = kwargs["ti"].xcom_pull(key="cwl_workflows", task_ids="build_cwl_ribca")
389388

390389
input_parameters = [
391390
{"parameter_name": "--data_dir", "value": str(data_dir)},
@@ -439,7 +438,10 @@ def build_cwltool_cmd_sprm(**kwargs):
439438
{"parameter_name": "--processes", "value": get_threads_resource(dag.dag_id)},
440439
{"parameter_name": "--image_dir", "value": str(data_dir / "pipeline_output/expr")},
441440
{"parameter_name": "--mask_dir", "value": str(data_dir / "pipeline_output/mask")},
442-
{"parameter_name": "--cell_types_directory", "value": str(data_dir / "ribca_for_sprm")},
441+
{
442+
"parameter_name": "--cell_types_directory",
443+
"value": str(data_dir / "ribca_for_sprm"),
444+
},
443445
{"parameter_name": "--cell_types_directory", "value": str(data_dir / "deepcelltypes")},
444446
]
445447

@@ -798,64 +800,52 @@ def build_cwltool_cmd_sprm_to_anndata(**kwargs):
798800
(
799801
t_log_info
800802
>> t_create_tmpdir
801-
802803
>> prepare_cwl_illumination_first_stitching
803804
>> t_build_cwl_illumination_first_stitching
804805
>> t_pipeline_exec_cwl_illumination_first_stitching
805806
>> t_maybe_keep_cwl_illumination_first_stitching
806-
807807
>> prepare_cwl_cytokit
808808
>> t_build_cwl_cytokit
809809
>> t_pipeline_exec_cwl_cytokit
810810
>> t_maybe_keep_cwl_cytokit
811-
812811
>> prepare_cwl_ometiff_second_stitching
813812
>> t_build_cwl_ometiff_second_stitching
814813
>> t_pipeline_exec_cwl_ometiff_second_stitching
815814
>> t_delete_internal_pipeline_files
816815
>> t_maybe_keep_cwl_ometiff_second_stitching
817-
818816
>> prepare_cwl_ribca
819817
>> t_build_cmd_ribca
820818
>> t_pipeline_exec_cwl_ribca
821819
>> t_maybe_keep_cwl_ribca
822-
823820
>> prepare_cwl_deepcelltypes
824821
>> t_build_cmd_deepcelltypes
825822
>> t_pipeline_exec_cwl_deepcelltypes
826823
>> t_maybe_keep_cwl_deepcelltypes
827-
828824
>> prepare_cwl_sprm
829825
>> t_build_cmd_sprm
830826
>> t_pipeline_exec_cwl_sprm
831827
>> t_maybe_keep_cwl_sprm
832-
833828
>> prepare_cwl_create_vis_symlink_archive
834829
>> t_build_cmd_create_vis_symlink_archive
835830
>> t_pipeline_exec_cwl_create_vis_symlink_archive
836831
>> t_maybe_keep_cwl_create_vis_symlink_archive
837-
838832
>> prepare_cwl_ome_tiff_pyramid
839833
>> t_build_cmd_ome_tiff_pyramid
840834
>> t_pipeline_exec_cwl_ome_tiff_pyramid
841835
>> t_maybe_keep_cwl_ome_tiff_pyramid
842-
843836
>> prepare_cwl_ome_tiff_offsets
844837
>> t_build_cmd_ome_tiff_offsets
845838
>> t_pipeline_exec_cwl_ome_tiff_offsets
846839
>> t_maybe_keep_cwl_ome_tiff_offsets
847-
848840
>> prepare_cwl_sprm_to_json
849841
>> t_build_cmd_sprm_to_json
850842
>> t_pipeline_exec_cwl_sprm_to_json
851843
>> t_maybe_keep_cwl_sprm_to_json
852-
853844
>> prepare_cwl_sprm_to_anndata
854845
>> t_build_cmd_sprm_to_anndata
855846
>> t_pipeline_exec_cwl_sprm_to_anndata
856847
>> t_maybe_keep_cwl_sprm_to_anndata
857848
>> t_maybe_create_dataset
858-
859849
>> t_send_create_dataset
860850
>> t_move_data
861851
>> t_expand_symlinks

src/ingest-pipeline/airflow/dags/multiome.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -52,13 +52,15 @@
5252
],
5353
)
5454

55+
5556
def find_atac_metadata_file(data_dir: Path) -> Path:
5657
for path in data_dir.glob("*.tsv"):
5758
name_lower = path.name.lower()
5859
if path.is_file() and "atac" in name_lower and "metadata" in name_lower:
5960
return path
6061
raise ValueError("Couldn't find ATAC-seq metadata file")
6162

63+
6264
def generate_multiome_dag(params: MultiomeSequencingDagParameters) -> DAG:
6365
default_args = {
6466
"owner": "hubmap",
@@ -85,7 +87,7 @@ def generate_multiome_dag(params: MultiomeSequencingDagParameters) -> DAG:
8587
},
8688
) as dag:
8789
workflow_version = "1.0.0"
88-
workflow_description = "The pipeline for multiome RNA-ATACseq data uses Salmon for alignment free quasi mapping of reads from RNA sequencing to the HG38 reference genome and HISAT2 for short read alignment of ATACseq reads to the same genome. Barcodes are then mapped between components of the assay to generate an annotated data matrix with consolidated RNA and ATACseq data. This annotated data matrix is then passed to the Muon package for dimensionality reduction, clustering, and multiomic factor analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
90+
workflow_description = "The pipeline for multiome RNA-ATACseq data uses Salmon for alignment free quasi mapping of reads from RNA sequencing to the hg38 reference genome and HISAT2 for short read alignment of ATACseq reads to the same genome. Barcodes are then mapped between components of the assay to generate an annotated data matrix with consolidated RNA and ATACseq data. This annotated data matrix is then passed to the Muon package for dimensionality reduction, clustering, and multiomic factor analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
8991

9092
cwl_workflows = [
9193
{
@@ -175,7 +177,10 @@ def build_cwltool_cmd1(**kwargs):
175177
if (count := len(atac_metadata_files)) != 1:
176178
raise ValueError(f"Need 1 ATAC-seq metadata file, found {count}")
177179
input_parameters.append(
178-
{"parameter_name": "--atac_metadata_file", "value": str(atac_metadata_files[0])}
180+
{
181+
"parameter_name": "--atac_metadata_file",
182+
"value": str(atac_metadata_files[0]),
183+
}
179184
)
180185

181186
command = get_cwl_cmd_from_workflows(
@@ -191,8 +196,9 @@ def build_cwltool_cmd2(**kwargs):
191196

192197
# get organ type
193198
ds_rslt = pythonop_get_dataset_state(
194-
dataset_uuid_callable=lambda **kwargs:
195-
get_parent_dataset_uuids_list(**kwargs)[0], **kwargs)
199+
dataset_uuid_callable=lambda **kwargs: get_parent_dataset_uuids_list(**kwargs)[0],
200+
**kwargs,
201+
)
196202

197203
organ_list = list(set(ds_rslt["organs"]))
198204
organ_code = organ_list[0] if len(organ_list) == 1 else "multi"
@@ -379,23 +385,19 @@ def build_cwltool_cmd3(**kwargs):
379385
(
380386
t_log_info
381387
>> t_create_tmpdir
382-
383388
>> prepare_cwl1
384389
>> t_build_cmd1
385390
>> t_pipeline_exec
386391
>> t_maybe_keep_cwl1
387-
388392
>> prepare_cwl2
389393
>> t_build_cmd2
390394
>> t_pipeline_exec_azimuth_annotate
391395
>> t_maybe_keep_cwl2
392-
393396
>> prepare_cwl3
394397
>> t_build_cmd3
395398
>> t_convert_for_ui
396399
>> t_maybe_keep_cwl3
397400
>> t_maybe_create_dataset
398-
399401
>> t_send_create_dataset
400402
>> t_move_data
401403
>> t_send_status

src/ingest-pipeline/airflow/dags/salmon_rnaseq.py

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,7 @@ def generate_salmon_rnaseq_dag(params: SequencingDagParameters) -> DAG:
6767
},
6868
) as dag:
6969
workflow_version = "1.0.0"
70-
workflow_description = "The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
70+
workflow_description = params.workflow_description
7171

7272
cwl_workflows = [
7373
{
@@ -165,8 +165,9 @@ def build_cwltool_cmd2(**kwargs):
165165

166166
# get organ type
167167
ds_rslt = pythonop_get_dataset_state(
168-
dataset_uuid_callable=lambda **kwargs:
169-
get_parent_dataset_uuids_list(**kwargs)[0], **kwargs)
168+
dataset_uuid_callable=lambda **kwargs: get_parent_dataset_uuids_list(**kwargs)[0],
169+
**kwargs,
170+
)
170171

171172
organ_list = list(set(ds_rslt["organs"]))
172173
organ_code = organ_list[0] if len(organ_list) == 1 else "multi"
@@ -400,28 +401,23 @@ def build_cwltool_cmd4(**kwargs):
400401
(
401402
t_log_info
402403
>> t_create_tmpdir
403-
404404
>> prepare_cwl1
405405
>> t_build_cmd1
406406
>> t_pipeline_exec
407407
>> t_maybe_keep_cwl1
408-
409408
>> prepare_cwl2
410409
>> t_build_cmd2
411410
>> t_pipeline_exec_azimuth_annotate
412411
>> t_maybe_keep_cwl2
413-
414412
>> prepare_cwl3
415413
>> t_build_cmd3
416414
>> t_convert_for_ui
417415
>> t_maybe_keep_cwl3
418-
419416
>> prepare_cwl4
420417
>> t_build_cmd4
421418
>> t_convert_for_ui_2
422419
>> t_maybe_keep_cwl4
423420
>> t_maybe_create_dataset
424-
425421
>> t_send_create_dataset
426422
>> t_move_data
427423
>> t_send_status
@@ -441,11 +437,18 @@ def build_cwltool_cmd4(**kwargs):
441437
def get_salmon_dag_params(assay: str) -> SequencingDagParameters:
442438
# TODO: restructure assay names, pipeline names, etc.; this repetition
443439
# is for backward compatibility
440+
if assay == "slideseq":
441+
workflow_description = "The pipeline for slideseq data uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. SquidPy is used to provide spatial analysis, using spatial coordinates provided by data providers. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
442+
elif assay == "snareseq":
443+
workflow_description = "The pipeline for multiome RNA-ATACseq data uses Salmon for alignment free quasi mapping of reads from RNA sequencing to the hg38 reference genome and HISAT2 for short read alignment of ATACseq reads to the same genome. Barcodes are then mapped between components of the assay to generate an annotated data matrix with consolidated RNA and ATACseq data. This annotated data matrix is then passed to the Muon package for dimensionality reduction, clustering, and multiomic factor analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
444+
else:
445+
workflow_description = "The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
446+
444447
return SequencingDagParameters(
445448
dag_id=f"salmon_rnaseq_{assay}",
446449
pipeline_name=f"salmon-rnaseq-{assay}",
447450
assay=assay,
448-
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
451+
workflow_description=workflow_description,
449452
)
450453

451454

@@ -455,25 +458,25 @@ def get_salmon_dag_params(assay: str) -> SequencingDagParameters:
455458
dag_id="salmon_rnaseq_10x",
456459
pipeline_name="salmon-rnaseq",
457460
assay="10x_v3",
458-
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
461+
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
459462
),
460463
SequencingDagParameters(
461464
dag_id="salmon_rnaseq_10x_sn",
462465
pipeline_name="salmon-rnaseq",
463466
assay="10x_v3_sn",
464-
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
467+
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
465468
),
466469
SequencingDagParameters(
467470
dag_id="salmon_rnaseq_10x_v2",
468471
pipeline_name="salmon-rnaseq",
469472
assay="10x_v2",
470-
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
473+
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
471474
),
472475
SequencingDagParameters(
473476
dag_id="salmon_rnaseq_10x_v2_sn",
474477
pipeline_name="salmon-rnaseq",
475478
assay="10x_v2_sn",
476-
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
479+
workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
477480
),
478481
get_salmon_dag_params("sciseq"),
479482
get_salmon_dag_params("slideseq"),

0 commit comments

Comments
 (0)