Skip to content

Commit 302eb3f

Browse files
authored
10x Adapter WDL (#83)
* Add adapter wdl for cellranger 10x wdl
* Rename fastqs cellranger inputs
* Change docker image for testing
* Test renaming files
* Fix renaming files
* Use sample name instead of id
* Retest
* Fix getting file names
* Handle case where expect_cells is not defined
* Add cellranger inputs, options and readme files
* Style changes. Also use bash instead of python for renaming files
* Update pipeline_tools_version tag
* Update cellranger reference
1 parent e840fd2 commit 302eb3f

File tree

7 files changed

+296
-2
lines changed

7 files changed

+296
-2
lines changed

adapter_pipelines/Optimus/adapter.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ workflow AdapterOptimus {
127127
Int max_cromwell_retries = 0
128128
Boolean add_md5s = false
129129

130-
String pipeline_tools_version = "v0.29.0"
130+
String pipeline_tools_version = "v0.32.0"
131131

132132
call GetInputs as prep {
133133
input:
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Overview
2+
3+
This directory contains an adapter pipeline used by the Secondary Analysis Service to run the CellRanger count pipeline.
4+
5+
# Files
6+
7+
* adapter.wdl
8+
The adapter pipeline, which parses a bundle manifest from the Data Storage Service, runs the CellRanger count analysis pipeline, then runs the submission pipeline to submit the results to the Ingest Service.
9+
10+
* adapter_example_static.json
11+
Example inputs to use when running this pipeline that stay the same for every run.
12+
13+
* options.json
14+
Options file to use when running workflow.
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
import "cellranger.wdl" as CellRanger
2+
import "submit.wdl" as submit_wdl
3+
4+
5+
# Queries the Data Storage Service for the given bundle and materializes the
# 10x inputs as small text files (one entry per line) that the workflow reads
# back with read_string/read_lines.
# NOTE(review): this calls input_utils.create_optimus_input_tsv even though
# this is the 10x/CellRanger adapter — presumably that helper also writes
# sample_id.txt, r1/r2/i1.txt and lanes.txt needed here; confirm against the
# pipeline_tools version pinned by pipeline_tools_version.
task GetInputs {
  String bundle_uuid                # DSS bundle UUID to analyze
  String bundle_version             # DSS bundle version timestamp
  String dss_url                    # Base URL of the Data Storage Service
  Int? retry_timeout                # Overall timeout across HTTP retries
  Float? retry_multiplier           # Backoff multiplier between retries
  Int? retry_max_interval           # Cap on the retry backoff interval
  Int? individual_request_timeout   # Timeout for each single HTTP request
  Boolean record_http               # When true, record HTTP traffic to request_*/response_* files
  String pipeline_tools_version     # Tag of the pipeline-tools docker image to run in

  command <<<
    # Retry/recording knobs are passed to pipeline_tools via the environment.
    export RECORD_HTTP_REQUESTS="${record_http}"
    export RETRY_TIMEOUT="${retry_timeout}"
    export RETRY_MULTIPLIER="${retry_multiplier}"
    export RETRY_MAX_INTERVAL="${retry_max_interval}"
    export INDIVIDUAL_REQUEST_TIMEOUT="${individual_request_timeout}"

    # Force the binary layer of the stdout and stderr streams to be unbuffered.
    python -u <<CODE
    from pipeline_tools import input_utils

    input_utils.create_optimus_input_tsv(
        "${bundle_uuid}",
        "${bundle_version}",
        "${dss_url}")

    CODE
  >>>
  runtime {
    docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
  }
  output {
    String sample_id = read_string("sample_id.txt")
    # One fastq URL per line, index-aligned across r1/r2/i1 and lanes.
    Array[File] r1_fastq = read_lines("r1.txt")
    Array[File] r2_fastq = read_lines("r2.txt")
    Array[File] i1_fastq = read_lines("i1.txt")
    Array[Int] lanes = read_lines("lanes.txt")
    # Only populated when record_http is true.
    Array[File] http_requests = glob("request_*.txt")
    Array[File] http_responses = glob("response_*.txt")
  }
}
47+
48+
# Renames one lane's trio of blue-box fastq files to the bcl2fastq-style
# names (<sample>_S1_L00<lane>_{R1,R2,I1}_001.fastq.gz) that cellranger
# requires for fastq discovery.
task RenameFiles {
  File r1                         # Read 1 fastq for this lane
  File r2                         # Read 2 fastq for this lane
  File i1                         # Index 1 fastq for this lane
  String sample_id                # Sample name to embed in the new file names
  String lane                     # Lane number to embed in the new file names
  String pipeline_tools_version   # Tag of the pipeline-tools docker image

  command <<<
    # Quote the localized source paths: Cromwell localizes inputs under
    # execution-dependent directories, and an unquoted path would break the
    # mv commands if any path component contains whitespace.
    mv '${r1}' '${sample_id}_S1_L00${lane}_R1_001.fastq.gz'
    mv '${r2}' '${sample_id}_S1_L00${lane}_R2_001.fastq.gz'
    mv '${i1}' '${sample_id}_S1_L00${lane}_I1_001.fastq.gz'
  >>>
  runtime {
    docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
  }
  output {
    File r1_new = "${sample_id}_S1_L00${lane}_R1_001.fastq.gz"
    File r2_new = "${sample_id}_S1_L00${lane}_R2_001.fastq.gz"
    File i1_new = "${sample_id}_S1_L00${lane}_I1_001.fastq.gz"
  }
}
70+
71+
# Builds the name/value inputs.tsv that the submit pipeline reports to the
# Ingest Service: the full fastq list as one comma-joined value, each entry
# of other_inputs, and expect_cells only when it was provided.
task InputsForSubmit {
  Array[File] fastqs              # All input fastq urls; recorded as a single comma-separated value
  Array[Object] other_inputs      # Additional name/value pairs to record
  Int? expect_cells               # Optional expected cell count passed to cellranger
  String pipeline_tools_version   # Tag of the pipeline-tools docker image

  command <<<
    # Force the binary layer of the stdout and stderr streams to be unbuffered.
    python -u <<CODE

    inputs = []

    print('fastq_files')
    inputs.append({"name": "fastqs", "value": "${sep=', ' fastqs}"})

    print('other inputs')
    # write_objects emits a header line of keys followed by one
    # tab-separated line of values per object.
    with open('${write_objects(other_inputs)}') as f:
        keys = f.readline().strip().split('\t')
        for line in f:
            values = line.strip().split('\t')
            # Renamed from `input` to avoid shadowing the Python builtin.
            entry = {}
            for i, key in enumerate(keys):
                entry[key] = values[i]
            print(entry)
            inputs.append(entry)

    print('expect cells')
    # WDL interpolates an undefined optional as the empty string, so this
    # truthiness check only appends expect_cells when it was actually set.
    if "${expect_cells}":
        inputs.append({"name": "expect_cells", "value": "${expect_cells}"})

    print('write inputs.tsv')
    with open('inputs.tsv', 'w') as f:
        f.write('name\tvalue\n')
        for entry in inputs:
            print(entry)
            f.write('{0}\t{1}\n'.format(entry['name'], entry['value']))
    print('finished')
    CODE
  >>>

  runtime {
    docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
  }

  output {
    File inputs = "inputs.tsv"
  }
}
119+
120+
# Adapter workflow for the 10x CellRanger count pipeline: parses a DSS
# bundle, renames fastqs into the layout cellranger expects, runs the
# analysis, then submits the results to the Ingest Service.
workflow Adapter10xCount {
  String bundle_uuid      # DSS bundle UUID to analyze
  String bundle_version   # DSS bundle version timestamp

  String reference_name           # Reference genome name, e.g. "GRCh38"
  File transcriptome_tar_gz       # CellRanger-compatible reference tarball
  Int? expect_cells               # Optional expected cell count for cellranger

  # Submission
  File format_map
  String dss_url
  String submit_url
  String method
  String schema_url
  String cromwell_url
  String analysis_process_schema_version
  String analysis_protocol_schema_version
  String analysis_file_version
  String run_type
  Int? retry_max_interval
  Float? retry_multiplier
  Int? retry_timeout
  Int? individual_request_timeout
  String reference_bundle
  Boolean use_caas

  # Set runtime environment such as "dev" or "staging" or "prod" so submit task could choose proper docker image to use
  String runtime_environment
  # By default, don't record http requests, unless we override in inputs json
  Boolean record_http = false
  # NOTE(review): declared but not referenced by any call in this workflow —
  # confirm whether it should be wired into the task runtimes.
  Int max_cromwell_retries = 0
  Boolean add_md5s = false

  String pipeline_tools_version = "v0.32.0"

  call GetInputs {
    input:
      bundle_uuid = bundle_uuid,
      bundle_version = bundle_version,
      dss_url = dss_url,
      retry_multiplier = retry_multiplier,
      retry_max_interval = retry_max_interval,
      retry_timeout = retry_timeout,
      individual_request_timeout = individual_request_timeout,
      record_http = record_http,
      pipeline_tools_version = pipeline_tools_version
  }

  # Cellranger code in 10x count wdl requires files to be named a certain way.
  # To accommodate that, RenameFiles copies the blue box files into the
  # cromwell execution bucket but with the names cellranger expects.
  # Putting this in its own task lets us take advantage of automatic localizing
  # and delocalizing by Cromwell/JES to actually read and write stuff in buckets.
  # TODO: Replace scatter with a for-loop inside of the task to avoid creating a
  # VM for each set of files that needs to be renamed
  scatter(i in range(length(GetInputs.lanes))) {
    call RenameFiles as prep {
      input:
        r1 = GetInputs.r1_fastq[i],
        r2 = GetInputs.r2_fastq[i],
        i1 = GetInputs.i1_fastq[i],
        sample_id = GetInputs.sample_id,
        lane = GetInputs.lanes[i],
        pipeline_tools_version = pipeline_tools_version
    }
  }

  # CellRanger gets the paths to the fastq directories from the array of fastqs,
  # so the order of those files does not matter
  call CellRanger.CellRanger as analysis {
    input:
      sample_id = GetInputs.sample_id,
      fastqs = flatten([prep.r1_new, prep.r2_new, prep.i1_new]),
      reference_name = reference_name,
      transcriptome_tar_gz = transcriptome_tar_gz,
      expect_cells = expect_cells
  }

  # Record the ORIGINAL (pre-rename) fastq urls in the submission metadata.
  call InputsForSubmit {
    input:
      fastqs = flatten([GetInputs.r1_fastq, GetInputs.r2_fastq, GetInputs.i1_fastq]),
      other_inputs = [
        {
          "name": "sample_id",
          "value": GetInputs.sample_id
        },
        {
          "name": "reference_name",
          "value": reference_name
        },
        {
          "name": "transcriptome_tar_gz",
          "value": transcriptome_tar_gz
        }
      ],
      expect_cells = expect_cells,
      pipeline_tools_version = pipeline_tools_version
  }

  Array[Object] inputs = read_objects(InputsForSubmit.inputs)

  # Submit the analysis outputs and provenance to the Ingest Service.
  call submit_wdl.submit {
    input:
      inputs = inputs,
      outputs = [
        analysis.qc,
        analysis.sorted_bam,
        analysis.sorted_bam_index,
        analysis.barcodes,
        analysis.genes,
        analysis.matrix,
        analysis.filtered_gene_h5,
        analysis.raw_gene_h5,
        analysis.raw_barcodes,
        analysis.raw_genes,
        analysis.raw_matrix,
        analysis.mol_info_h5
      ],
      format_map = format_map,
      submit_url = submit_url,
      cromwell_url = cromwell_url,
      input_bundle_uuid = bundle_uuid,
      reference_bundle = reference_bundle,
      run_type = run_type,
      schema_url = schema_url,
      analysis_process_schema_version = analysis_process_schema_version,
      analysis_protocol_schema_version = analysis_protocol_schema_version,
      analysis_file_version = analysis_file_version,
      method = method,
      retry_multiplier = retry_multiplier,
      retry_max_interval = retry_max_interval,
      retry_timeout = retry_timeout,
      individual_request_timeout = individual_request_timeout,
      runtime_environment = runtime_environment,
      use_caas = use_caas,
      record_http = record_http,
      pipeline_tools_version = pipeline_tools_version,
      add_md5s = add_md5s,
      pipeline_version = analysis.pipeline_version
  }
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"Adapter10xCount.reference_name": "GRCh38",
3+
"Adapter10xCount.transcriptome_tar_gz": "gs://hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38_GencodeV27_Primary_CellRanger.tar",
4+
"Adapter10xCount.expect_cells": 5000,
5+
"Adapter10xCount.reference_bundle": "bf51d668-3e14-4843-9bc7-5d676fdf0e01",
6+
"Adapter10xCount.format_map": "gs://hca-dcp-mint-test-data/adapters/file_format_map.json",
7+
"Adapter10xCount.method": "10x",
8+
"Adapter10xCount.analysis_file_version": "5.3.4",
9+
"Adapter10xCount.analysis_protocol_schema_version": "8.0.3",
10+
"Adapter10xCount.analysis_process_schema_version": "8.0.3",
11+
"Adapter10xCount.run_type": "run",
12+
"Adapter10xCount.add_md5s": false
13+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"read_from_cache": true
3+
}

adapter_pipelines/ss2_single_sample/adapter.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ workflow AdapterSmartSeq2SingleCell{
8383
Int max_cromwell_retries = 0
8484
Boolean add_md5s = false
8585

86-
String pipeline_tools_version = "v0.31.0"
86+
String pipeline_tools_version = "v0.32.0"
8787

8888
call GetInputs as prep {
8989
input:

pipeline_tools/input_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ def create_optimus_input_tsv(uuid, version, dss_url):
207207
with open('i1.txt', 'w') as f:
    for url in i1_urls:
        f.write(url + '\n')
# Write one lane number per line: the adapter WDL consumes this file with
# WDL's read_lines(), which splits on newlines. The previous code wrote the
# numbers with no separator, so multiple lanes (e.g. 1 and 2) would fuse
# into a single token ("12") and break the per-lane scatter.
with open('lanes.txt', 'w') as f:
    for lane in sorted(lane_to_fastqs.keys()):
        f.write(str(lane) + '\n')

sample_id = get_sample_id(primary_bundle)
print('Writing sample ID to sample_id.txt')

0 commit comments

Comments
 (0)