hubmapconsortium
diff --git a/src/ingest-pipeline/airflow/dags/bulk_atacseq.py b/src/ingest-pipeline/airflow/dags/bulk_atacseq.py
Lines changed: 1 addition & 3 deletions
diff --git a/src/ingest-pipeline/airflow/dags/codex_cytokit.py b/src/ingest-pipeline/airflow/dags/codex_cytokit.py
Lines changed: 16 additions & 26 deletions
diff --git a/src/ingest-pipeline/airflow/dags/multiome.py b/src/ingest-pipeline/airflow/dags/multiome.py
Lines changed: 10 additions & 8 deletions
diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py
Lines changed: 16 additions & 13 deletions
@@ -61,7 +61,7 @@
 ) as dag:
     pipeline_name = "bulk-atac-seq"
     workflow_version = "1.0.0"
-    workflow_description = "The bulk ATAC seq pipeline performs short read alignment to the HG38 reference genome using HISAT-2, and then calls peaks on the resulting BAM file using MACS2."
+    workflow_description = "The bulk ATAC seq pipeline performs short read alignment to the hg38 reference genome using HISAT-2, and then calls peaks on the resulting BAM file using MACS2."
 
     cwl_workflows = [
         {
@@ -195,13 +195,11 @@ def build_cwltool_cmd1(**kwargs):
     (
         t_log_info
         >> t_create_tmpdir
-
         >> prepare_cwl1
         >> t_build_cmd1
         >> t_pipeline_exec
         >> t_maybe_keep_cwl1
         >> t_maybe_create_dataset
-
         >> t_send_create_dataset
         >> t_move_data
         >> t_send_status
 
@@ -62,7 +62,7 @@
 ) as dag:
     pipeline_name = "codex-pipeline"
     workflow_version = "1.0.0"
-    workflow_description = "The CODEX pipeline performs illumination correction and other pre-processing steps, segments nuclei and cells using Cytokit, and performs spatial analysis of expression data using SPRM, which computes various measures of analyte intensity per cell, performs clustering based on expression and other data, and computes markers for each cluster"
+    workflow_description = "The CODEX pipeline performs illumination correction and other pre-processing steps, segments nuclei and cells using Cytokit, and performs spatial analysis of expression data using SPRM, which computes various measures of analyte intensity per cell, performs clustering based on expression and other data, and computes markers for each cluster."
     steps_dir = Path(pipeline_name) / "steps"
 
     cwl_workflows = [
@@ -83,9 +83,7 @@
             "documentation_url": "",
         },
         {
-            "workflow_path": str(
-                get_absolute_workflow(Path("ribca", "pipeline.cwl"))
-            ),
+            "workflow_path": str(get_absolute_workflow(Path("ribca", "pipeline.cwl"))),
             "documentation_url": "",
         },
         {
@@ -264,9 +262,14 @@ def build_cwltool_cwl_ometiff_second_stitching(**kwargs):
         input_parameters = [
             {"parameter_name": "--cytokit_config", "value": str(data_dir / "experiment.yaml")},
             {"parameter_name": "--cytokit_output", "value": str(data_dir / "cytokit")},
-            {"parameter_name": "--slicing_pipeline_config",
-             "value": str(data_dir / "pipelineConfig.json"), },
-            {"parameter_name": "--num_concurrent_tasks", "value": get_threads_resource(dag.dag_id)},
+            {
+                "parameter_name": "--slicing_pipeline_config",
+                "value": str(data_dir / "pipelineConfig.json"),
+            },
+            {
+                "parameter_name": "--num_concurrent_tasks",
+                "value": get_threads_resource(dag.dag_id),
+            },
             {"parameter_name": "--data_dir", "value": str(get_parent_data_dir(**kwargs))},
         ]
         command = get_cwl_cmd_from_workflows(workflows, 2, input_parameters, tmpdir, kwargs["ti"])
@@ -322,9 +325,7 @@ def build_cwltool_cwl_ribca(**kwargs):
             key="cwl_workflows", task_ids="build_cwl_ometiff_second_stitching"
         )
 
-        input_parameters = [
-            {"parameter_name": "--data_dir", "value": str(data_dir)}
-        ]
+        input_parameters = [{"parameter_name": "--data_dir", "value": str(data_dir)}]
         command = get_cwl_cmd_from_workflows(workflows, 3, input_parameters, tmpdir, kwargs["ti"])
 
         return join_quote_command_str(command)
@@ -383,9 +384,7 @@ def build_cwltool_cmd_deepcelltypes(**kwargs):
         data_dir = tmpdir / "cwl_out"
         print("data_dir: ", data_dir)
 
-        workflows = kwargs["ti"].xcom_pull(
-            key="cwl_workflows", task_ids="build_cwl_ribca"
-        )
+        workflows = kwargs["ti"].xcom_pull(key="cwl_workflows", task_ids="build_cwl_ribca")
 
         input_parameters = [
             {"parameter_name": "--data_dir", "value": str(data_dir)},
@@ -439,7 +438,10 @@ def build_cwltool_cmd_sprm(**kwargs):
             {"parameter_name": "--processes", "value": get_threads_resource(dag.dag_id)},
             {"parameter_name": "--image_dir", "value": str(data_dir / "pipeline_output/expr")},
             {"parameter_name": "--mask_dir", "value": str(data_dir / "pipeline_output/mask")},
-            {"parameter_name": "--cell_types_directory", "value": str(data_dir / "ribca_for_sprm")},
+            {
+                "parameter_name": "--cell_types_directory",
+                "value": str(data_dir / "ribca_for_sprm"),
+            },
             {"parameter_name": "--cell_types_directory", "value": str(data_dir / "deepcelltypes")},
         ]
 
@@ -798,64 +800,52 @@ def build_cwltool_cmd_sprm_to_anndata(**kwargs):
     (
         t_log_info
         >> t_create_tmpdir
-
         >> prepare_cwl_illumination_first_stitching
         >> t_build_cwl_illumination_first_stitching
         >> t_pipeline_exec_cwl_illumination_first_stitching
         >> t_maybe_keep_cwl_illumination_first_stitching
-
         >> prepare_cwl_cytokit
         >> t_build_cwl_cytokit
         >> t_pipeline_exec_cwl_cytokit
         >> t_maybe_keep_cwl_cytokit
-
         >> prepare_cwl_ometiff_second_stitching
         >> t_build_cwl_ometiff_second_stitching
         >> t_pipeline_exec_cwl_ometiff_second_stitching
         >> t_delete_internal_pipeline_files
         >> t_maybe_keep_cwl_ometiff_second_stitching
-
         >> prepare_cwl_ribca
         >> t_build_cmd_ribca
         >> t_pipeline_exec_cwl_ribca
         >> t_maybe_keep_cwl_ribca
-
         >> prepare_cwl_deepcelltypes
         >> t_build_cmd_deepcelltypes
         >> t_pipeline_exec_cwl_deepcelltypes
         >> t_maybe_keep_cwl_deepcelltypes
-
         >> prepare_cwl_sprm
         >> t_build_cmd_sprm
         >> t_pipeline_exec_cwl_sprm
         >> t_maybe_keep_cwl_sprm
-
         >> prepare_cwl_create_vis_symlink_archive
         >> t_build_cmd_create_vis_symlink_archive
         >> t_pipeline_exec_cwl_create_vis_symlink_archive
         >> t_maybe_keep_cwl_create_vis_symlink_archive
-
         >> prepare_cwl_ome_tiff_pyramid
         >> t_build_cmd_ome_tiff_pyramid
         >> t_pipeline_exec_cwl_ome_tiff_pyramid
         >> t_maybe_keep_cwl_ome_tiff_pyramid
-
         >> prepare_cwl_ome_tiff_offsets
         >> t_build_cmd_ome_tiff_offsets
         >> t_pipeline_exec_cwl_ome_tiff_offsets
         >> t_maybe_keep_cwl_ome_tiff_offsets
-
         >> prepare_cwl_sprm_to_json
         >> t_build_cmd_sprm_to_json
         >> t_pipeline_exec_cwl_sprm_to_json
         >> t_maybe_keep_cwl_sprm_to_json
-
         >> prepare_cwl_sprm_to_anndata
         >> t_build_cmd_sprm_to_anndata
         >> t_pipeline_exec_cwl_sprm_to_anndata
         >> t_maybe_keep_cwl_sprm_to_anndata
         >> t_maybe_create_dataset
-
         >> t_send_create_dataset
         >> t_move_data
         >> t_expand_symlinks
 
@@ -52,13 +52,15 @@
     ],
 )
 
+
 def find_atac_metadata_file(data_dir: Path) -> Path:
     """Return an ATAC-seq metadata ``*.tsv`` file found directly in ``data_dir``."""
     # A candidate must be a regular file whose lower-cased name mentions both
     # "atac" and "metadata"; the first one yielded by ``glob`` wins.
     candidates = (
         entry
         for entry in data_dir.glob("*.tsv")
         if entry.is_file() and all(tag in entry.name.lower() for tag in ("atac", "metadata"))
     )
     found = next(candidates, None)
     if found is None:
         # Nothing in the directory qualified.
         raise ValueError("Couldn't find ATAC-seq metadata file")
     return found
 
+
 def generate_multiome_dag(params: MultiomeSequencingDagParameters) -> DAG:
     default_args = {
         "owner": "hubmap",
@@ -85,7 +87,7 @@ def generate_multiome_dag(params: MultiomeSequencingDagParameters) -> DAG:
         },
     ) as dag:
         workflow_version = "1.0.0"
-        workflow_description = "The pipeline for multiome RNA-ATACseq data uses Salmon for alignment free quasi mapping of reads from RNA sequencing to the HG38 reference genome and HISAT2 for short read alignment of ATACseq reads to the same genome.  Barcodes are then mapped between components of the assay to generate an annotated data matrix with consolidated RNA and ATACseq data.  This annotated data matrix is then passed to the Muon package for dimensionality reduction, clustering, and multiomic factor analysis.  Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
+        workflow_description = "The pipeline for multiome RNA-ATACseq data uses Salmon for alignment free quasi mapping of reads from RNA sequencing to the hg38 reference genome and HISAT2 for short read alignment of ATACseq reads to the same genome.  Barcodes are then mapped between components of the assay to generate an annotated data matrix with consolidated RNA and ATACseq data.  This annotated data matrix is then passed to the Muon package for dimensionality reduction, clustering, and multiomic factor analysis.  Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
 
         cwl_workflows = [
             {
@@ -175,7 +177,10 @@ def build_cwltool_cmd1(**kwargs):
                 if (count := len(atac_metadata_files)) != 1:
                     raise ValueError(f"Need 1 ATAC-seq metadata file, found {count}")
                 input_parameters.append(
-                    {"parameter_name": "--atac_metadata_file", "value": str(atac_metadata_files[0])}
+                    {
+                        "parameter_name": "--atac_metadata_file",
+                        "value": str(atac_metadata_files[0]),
+                    }
                 )
 
             command = get_cwl_cmd_from_workflows(
@@ -191,8 +196,9 @@ def build_cwltool_cmd2(**kwargs):
 
             # get organ type
             ds_rslt = pythonop_get_dataset_state(
-                dataset_uuid_callable=lambda **kwargs:
-                get_parent_dataset_uuids_list(**kwargs)[0], **kwargs)
+                dataset_uuid_callable=lambda **kwargs: get_parent_dataset_uuids_list(**kwargs)[0],
+                **kwargs,
+            )
 
             organ_list = list(set(ds_rslt["organs"]))
             organ_code = organ_list[0] if len(organ_list) == 1 else "multi"
@@ -379,23 +385,19 @@ def build_cwltool_cmd3(**kwargs):
         (
             t_log_info
             >> t_create_tmpdir
-
             >> prepare_cwl1
             >> t_build_cmd1
             >> t_pipeline_exec
             >> t_maybe_keep_cwl1
-
             >> prepare_cwl2
             >> t_build_cmd2
             >> t_pipeline_exec_azimuth_annotate
             >> t_maybe_keep_cwl2
-
             >> prepare_cwl3
             >> t_build_cmd3
             >> t_convert_for_ui
             >> t_maybe_keep_cwl3
             >> t_maybe_create_dataset
-
             >> t_send_create_dataset
             >> t_move_data
             >> t_send_status
 
@@ -67,7 +67,7 @@ def generate_salmon_rnaseq_dag(params: SequencingDagParameters) -> DAG:
         },
     ) as dag:
         workflow_version = "1.0.0"
-        workflow_description = "The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
+        workflow_description = params.workflow_description
 
         cwl_workflows = [
             {
@@ -165,8 +165,9 @@ def build_cwltool_cmd2(**kwargs):
 
             # get organ type
             ds_rslt = pythonop_get_dataset_state(
-                    dataset_uuid_callable=lambda **kwargs:
-                    get_parent_dataset_uuids_list(**kwargs)[0], **kwargs)
+                dataset_uuid_callable=lambda **kwargs: get_parent_dataset_uuids_list(**kwargs)[0],
+                **kwargs,
+            )
 
             organ_list = list(set(ds_rslt["organs"]))
             organ_code = organ_list[0] if len(organ_list) == 1 else "multi"
@@ -400,28 +401,23 @@ def build_cwltool_cmd4(**kwargs):
         (
             t_log_info
             >> t_create_tmpdir
-
             >> prepare_cwl1
             >> t_build_cmd1
             >> t_pipeline_exec
             >> t_maybe_keep_cwl1
-
             >> prepare_cwl2
             >> t_build_cmd2
             >> t_pipeline_exec_azimuth_annotate
             >> t_maybe_keep_cwl2
-
             >> prepare_cwl3
             >> t_build_cmd3
             >> t_convert_for_ui
             >> t_maybe_keep_cwl3
-
             >> prepare_cwl4
             >> t_build_cmd4
             >> t_convert_for_ui_2
             >> t_maybe_keep_cwl4
             >> t_maybe_create_dataset
-
             >> t_send_create_dataset
             >> t_move_data
             >> t_send_status
@@ -441,11 +437,18 @@ def build_cwltool_cmd4(**kwargs):
 def get_salmon_dag_params(assay: str) -> SequencingDagParameters:
     # Build the SequencingDagParameters for one salmon-rnaseq assay variant.
     # The assay name is embedded in both identifiers: dag_id becomes
     # "salmon_rnaseq_<assay>" and pipeline_name becomes "salmon-rnaseq-<assay>".
     # In the added lines below, the workflow description is selected by assay:
     # "slideseq" and "snareseq" get their own text, and every other assay gets
     # the generic scRNA/snRNA description.
     # TODO: restructure assay names, pipeline names, etc.; this repetition
     #   is for backward compatibility
+    if assay == "slideseq":
+        workflow_description = "The pipeline for slideseq data uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. SquidPy is used to provide spatial analysis, using spatial coordinates provided by data providers.  Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
+    elif assay == "snareseq":
+        workflow_description = "The pipeline for multiome RNA-ATACseq data uses Salmon for alignment free quasi mapping of reads from RNA sequencing to the hg38 reference genome and HISAT2 for short read alignment of ATACseq reads to the same genome.  Barcodes are then mapped between components of the assay to generate an annotated data matrix with consolidated RNA and ATACseq data.  This annotated data matrix is then passed to the Muon package for dimensionality reduction, clustering, and multiomic factor analysis.  Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
+    else:
+        workflow_description = "The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed."
+
     return SequencingDagParameters(
         dag_id=f"salmon_rnaseq_{assay}",
         pipeline_name=f"salmon-rnaseq-{assay}",
         assay=assay,
-        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
+        workflow_description=workflow_description,
     )
 
 
@@ -455,25 +458,25 @@ def get_salmon_dag_params(assay: str) -> SequencingDagParameters:
         dag_id="salmon_rnaseq_10x",
         pipeline_name="salmon-rnaseq",
         assay="10x_v3",
-        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
+        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
     ),
     SequencingDagParameters(
         dag_id="salmon_rnaseq_10x_sn",
         pipeline_name="salmon-rnaseq",
         assay="10x_v3_sn",
-        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
+        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
     ),
     SequencingDagParameters(
         dag_id="salmon_rnaseq_10x_v2",
         pipeline_name="salmon-rnaseq",
         assay="10x_v2",
-        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
+        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
     ),
     SequencingDagParameters(
         dag_id="salmon_rnaseq_10x_v2_sn",
         pipeline_name="salmon-rnaseq",
         assay="10x_v2_sn",
-        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the HG38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
+        workflow_description="The pipeline for scRNA/snRNA data with whole transcriptome sequencing results uses Salmon alevin for alignment free quasimapping to the hg38 reference genome and converts the resulting capture bead by gene matrix to the h5ad format, which is used by ScanPy for downstream analysis including dimensionality reduction, unsupervised clustering, and differential expression analysis. Cell type annotations are provided by Azimuth when available for the type of tissue being analyzed.",
     ),
     get_salmon_dag_params("sciseq"),
     get_salmon_dag_params("slideseq"),