Skip to content

Commit 302eb3f

Browse files
authored
10x Adapter WDL (#83)
* Add adapter wdl for cellranger 10x wdl
* Rename fastqs cellranger inputs
* Change docker image for testing
* Test renaming files
* Fix renaming files
* Use sample name instead of id
* Retest
* Fix getting file names
* Handle case where expect_cells is not defined
* Add cellranger inputs, options and readme files
* Style changes. Also use bash instead of python for renaming files
* Update pipeline_tools_version tag
* Update cellranger reference
1 parent e840fd2 commit 302eb3f

File tree

7 files changed

+296
-2
lines changed

7 files changed

+296
-2
lines changed

adapter_pipelines/Optimus/adapter.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ workflow AdapterOptimus {
127127
Int max_cromwell_retries = 0
128128
Boolean add_md5s = false
129129

130-
String pipeline_tools_version = "v0.29.0"
130+
String pipeline_tools_version = "v0.32.0"
131131

132132
call GetInputs as prep {
133133
input:
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Overview
2+
3+
This directory contains an adapter pipeline used by the Secondary Analysis Service to run the CellRanger count pipeline.
4+
5+
# Files
6+
7+
* adapter.wdl
8+
The adapter pipeline, which parses a bundle manifest from the Data Storage Service, runs the CellRanger count analysis pipeline, then runs the submission pipeline to submit the results to the Ingest Service.
9+
10+
* adapter_example_static.json
11+
Example inputs to use when running this pipeline that stay the same for every run.
12+
13+
* options.json
14+
Options file to use when running workflow.
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
import "cellranger.wdl" as CellRanger
2+
import "submit.wdl" as submit_wdl
3+
4+
5+
# Queries the Data Storage Service for the given bundle and materializes the
# 10x inputs as small text files (one entry per line) that the workflow reads
# back with read_string/read_lines.
# NOTE(review): this calls input_utils.create_optimus_input_tsv even though
# this is the 10x/CellRanger adapter — presumably that helper also writes
# sample_id.txt, r1/r2/i1.txt and lanes.txt needed here; confirm against the
# pipeline_tools version pinned by pipeline_tools_version.
task GetInputs {
  String bundle_uuid                # DSS bundle UUID to analyze
  String bundle_version             # DSS bundle version timestamp
  String dss_url                    # Base URL of the Data Storage Service
  Int? retry_timeout                # Overall timeout across HTTP retries
  Float? retry_multiplier           # Backoff multiplier between retries
  Int? retry_max_interval           # Cap on the retry backoff interval
  Int? individual_request_timeout   # Timeout for each single HTTP request
  Boolean record_http               # When true, record HTTP traffic to request_*/response_* files
  String pipeline_tools_version     # Tag of the pipeline-tools docker image to run in

  command <<<
    # Retry/recording knobs are passed to pipeline_tools via the environment.
    export RECORD_HTTP_REQUESTS="${record_http}"
    export RETRY_TIMEOUT="${retry_timeout}"
    export RETRY_MULTIPLIER="${retry_multiplier}"
    export RETRY_MAX_INTERVAL="${retry_max_interval}"
    export INDIVIDUAL_REQUEST_TIMEOUT="${individual_request_timeout}"

    # Force the binary layer of the stdout and stderr streams to be unbuffered.
    python -u <<CODE
    from pipeline_tools import input_utils

    input_utils.create_optimus_input_tsv(
        "${bundle_uuid}",
        "${bundle_version}",
        "${dss_url}")

    CODE
  >>>
  runtime {
    docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
  }
  output {
    String sample_id = read_string("sample_id.txt")
    # One fastq URL per line, index-aligned across r1/r2/i1 and lanes.
    Array[File] r1_fastq = read_lines("r1.txt")
    Array[File] r2_fastq = read_lines("r2.txt")
    Array[File] i1_fastq = read_lines("i1.txt")
    Array[Int] lanes = read_lines("lanes.txt")
    # Only populated when record_http is true.
    Array[File] http_requests = glob("request_*.txt")
    Array[File] http_responses = glob("response_*.txt")
  }
}
47+
48+
# Renames one lane's trio of blue-box fastq files to the bcl2fastq-style
# names (<sample>_S1_L00<lane>_{R1,R2,I1}_001.fastq.gz) that cellranger
# requires for fastq discovery.
task RenameFiles {
  File r1                         # Read 1 fastq for this lane
  File r2                         # Read 2 fastq for this lane
  File i1                         # Index 1 fastq for this lane
  String sample_id                # Sample name to embed in the new file names
  String lane                     # Lane number to embed in the new file names
  String pipeline_tools_version   # Tag of the pipeline-tools docker image

  command <<<
    # Quote the localized source paths: Cromwell localizes inputs under
    # execution-dependent directories, and an unquoted path would break the
    # mv commands if any path component contains whitespace.
    mv '${r1}' '${sample_id}_S1_L00${lane}_R1_001.fastq.gz'
    mv '${r2}' '${sample_id}_S1_L00${lane}_R2_001.fastq.gz'
    mv '${i1}' '${sample_id}_S1_L00${lane}_I1_001.fastq.gz'
  >>>
  runtime {
    docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
  }
  output {
    File r1_new = "${sample_id}_S1_L00${lane}_R1_001.fastq.gz"
    File r2_new = "${sample_id}_S1_L00${lane}_R2_001.fastq.gz"
    File i1_new = "${sample_id}_S1_L00${lane}_I1_001.fastq.gz"
  }
}
70+
71+
# Builds the name/value inputs.tsv that the submit pipeline reports to the
# Ingest Service: the full fastq list as one comma-joined value, each entry
# of other_inputs, and expect_cells only when it was provided.
task InputsForSubmit {
  Array[File] fastqs              # All input fastq urls; recorded as a single comma-separated value
  Array[Object] other_inputs      # Additional name/value pairs to record
  Int? expect_cells               # Optional expected cell count passed to cellranger
  String pipeline_tools_version   # Tag of the pipeline-tools docker image

  command <<<
    # Force the binary layer of the stdout and stderr streams to be unbuffered.
    python -u <<CODE

    inputs = []

    print('fastq_files')
    inputs.append({"name": "fastqs", "value": "${sep=', ' fastqs}"})

    print('other inputs')
    # write_objects emits a header line of keys followed by one
    # tab-separated line of values per object.
    with open('${write_objects(other_inputs)}') as f:
        keys = f.readline().strip().split('\t')
        for line in f:
            values = line.strip().split('\t')
            # Renamed from `input` to avoid shadowing the Python builtin.
            entry = {}
            for i, key in enumerate(keys):
                entry[key] = values[i]
            print(entry)
            inputs.append(entry)

    print('expect cells')
    # WDL interpolates an undefined optional as the empty string, so this
    # truthiness check only appends expect_cells when it was actually set.
    if "${expect_cells}":
        inputs.append({"name": "expect_cells", "value": "${expect_cells}"})

    print('write inputs.tsv')
    with open('inputs.tsv', 'w') as f:
        f.write('name\tvalue\n')
        for entry in inputs:
            print(entry)
            f.write('{0}\t{1}\n'.format(entry['name'], entry['value']))
    print('finished')
    CODE
  >>>

  runtime {
    docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
  }

  output {
    File inputs = "inputs.tsv"
  }
}
119+
120+
# Adapter workflow for the 10x CellRanger count pipeline: parses a DSS
# bundle, renames fastqs into the layout cellranger expects, runs the
# analysis, then submits the results to the Ingest Service.
workflow Adapter10xCount {
  String bundle_uuid      # DSS bundle UUID to analyze
  String bundle_version   # DSS bundle version timestamp

  String reference_name           # Reference genome name, e.g. "GRCh38"
  File transcriptome_tar_gz       # CellRanger-compatible reference tarball
  Int? expect_cells               # Optional expected cell count for cellranger

  # Submission
  File format_map
  String dss_url
  String submit_url
  String method
  String schema_url
  String cromwell_url
  String analysis_process_schema_version
  String analysis_protocol_schema_version
  String analysis_file_version
  String run_type
  Int? retry_max_interval
  Float? retry_multiplier
  Int? retry_timeout
  Int? individual_request_timeout
  String reference_bundle
  Boolean use_caas

  # Set runtime environment such as "dev" or "staging" or "prod" so submit task could choose proper docker image to use
  String runtime_environment
  # By default, don't record http requests, unless we override in inputs json
  Boolean record_http = false
  # NOTE(review): declared but not referenced by any call in this workflow —
  # confirm whether it should be wired into the task runtimes.
  Int max_cromwell_retries = 0
  Boolean add_md5s = false

  String pipeline_tools_version = "v0.32.0"

  call GetInputs {
    input:
      bundle_uuid = bundle_uuid,
      bundle_version = bundle_version,
      dss_url = dss_url,
      retry_multiplier = retry_multiplier,
      retry_max_interval = retry_max_interval,
      retry_timeout = retry_timeout,
      individual_request_timeout = individual_request_timeout,
      record_http = record_http,
      pipeline_tools_version = pipeline_tools_version
  }

  # Cellranger code in 10x count wdl requires files to be named a certain way.
  # To accommodate that, RenameFiles copies the blue box files into the
  # cromwell execution bucket but with the names cellranger expects.
  # Putting this in its own task lets us take advantage of automatic localizing
  # and delocalizing by Cromwell/JES to actually read and write stuff in buckets.
  # TODO: Replace scatter with a for-loop inside of the task to avoid creating a
  # VM for each set of files that needs to be renamed
  scatter(i in range(length(GetInputs.lanes))) {
    call RenameFiles as prep {
      input:
        r1 = GetInputs.r1_fastq[i],
        r2 = GetInputs.r2_fastq[i],
        i1 = GetInputs.i1_fastq[i],
        sample_id = GetInputs.sample_id,
        lane = GetInputs.lanes[i],
        pipeline_tools_version = pipeline_tools_version
    }
  }

  # CellRanger gets the paths to the fastq directories from the array of fastqs,
  # so the order of those files does not matter
  call CellRanger.CellRanger as analysis {
    input:
      sample_id = GetInputs.sample_id,
      fastqs = flatten([prep.r1_new, prep.r2_new, prep.i1_new]),
      reference_name = reference_name,
      transcriptome_tar_gz = transcriptome_tar_gz,
      expect_cells = expect_cells
  }

  # Record the ORIGINAL (pre-rename) fastq urls in the submission metadata.
  call InputsForSubmit {
    input:
      fastqs = flatten([GetInputs.r1_fastq, GetInputs.r2_fastq, GetInputs.i1_fastq]),
      other_inputs = [
        {
          "name": "sample_id",
          "value": GetInputs.sample_id
        },
        {
          "name": "reference_name",
          "value": reference_name
        },
        {
          "name": "transcriptome_tar_gz",
          "value": transcriptome_tar_gz
        }
      ],
      expect_cells = expect_cells,
      pipeline_tools_version = pipeline_tools_version
  }

  Array[Object] inputs = read_objects(InputsForSubmit.inputs)

  # Submit the analysis outputs and provenance to the Ingest Service.
  call submit_wdl.submit {
    input:
      inputs = inputs,
      outputs = [
        analysis.qc,
        analysis.sorted_bam,
        analysis.sorted_bam_index,
        analysis.barcodes,
        analysis.genes,
        analysis.matrix,
        analysis.filtered_gene_h5,
        analysis.raw_gene_h5,
        analysis.raw_barcodes,
        analysis.raw_genes,
        analysis.raw_matrix,
        analysis.mol_info_h5
      ],
      format_map = format_map,
      submit_url = submit_url,
      cromwell_url = cromwell_url,
      input_bundle_uuid = bundle_uuid,
      reference_bundle = reference_bundle,
      run_type = run_type,
      schema_url = schema_url,
      analysis_process_schema_version = analysis_process_schema_version,
      analysis_protocol_schema_version = analysis_protocol_schema_version,
      analysis_file_version = analysis_file_version,
      method = method,
      retry_multiplier = retry_multiplier,
      retry_max_interval = retry_max_interval,
      retry_timeout = retry_timeout,
      individual_request_timeout = individual_request_timeout,
      runtime_environment = runtime_environment,
      use_caas = use_caas,
      record_http = record_http,
      pipeline_tools_version = pipeline_tools_version,
      add_md5s = add_md5s,
      pipeline_version = analysis.pipeline_version
  }
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"Adapter10xCount.reference_name": "GRCh38",
3+
"Adapter10xCount.transcriptome_tar_gz": "gs://hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38_GencodeV27_Primary_CellRanger.tar",
4+
"Adapter10xCount.expect_cells": 5000,
5+
"Adapter10xCount.reference_bundle": "bf51d668-3e14-4843-9bc7-5d676fdf0e01",
6+
"Adapter10xCount.format_map": "gs://hca-dcp-mint-test-data/adapters/file_format_map.json",
7+
"Adapter10xCount.method": "10x",
8+
"Adapter10xCount.analysis_file_version": "5.3.4",
9+
"Adapter10xCount.analysis_protocol_schema_version": "8.0.3",
10+
"Adapter10xCount.analysis_process_schema_version": "8.0.3",
11+
"Adapter10xCount.run_type": "run",
12+
"Adapter10xCount.add_md5s": false
13+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"read_from_cache": true
3+
}

adapter_pipelines/ss2_single_sample/adapter.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ workflow AdapterSmartSeq2SingleCell{
8383
Int max_cromwell_retries = 0
8484
Boolean add_md5s = false
8585

86-
String pipeline_tools_version = "v0.31.0"
86+
String pipeline_tools_version = "v0.32.0"
8787

8888
call GetInputs as prep {
8989
input:

pipeline_tools/input_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ def create_optimus_input_tsv(uuid, version, dss_url):
207207
with open('i1.txt', 'w') as f:
    for url in i1_urls:
        f.write(url + '\n')
# Write one lane number per line: the adapter WDL consumes this file with
# WDL's read_lines(), which splits on newlines. The previous code wrote the
# numbers with no separator, so multiple lanes (e.g. 1 and 2) would fuse
# into a single token ("12") and break the per-lane scatter.
with open('lanes.txt', 'w') as f:
    for lane in sorted(lane_to_fastqs.keys()):
        f.write(str(lane) + '\n')

sample_id = get_sample_id(primary_bundle)
print('Writing sample ID to sample_id.txt')

0 commit comments

Comments
 (0)