Skip to content

Commit cc29c4c

Browse files
author
David Shiga
authored
Merge pull request #26 from HumanCellAtlas/ds-get-sample-id-v5-598
Support getting sample id from v5 metadata
2 parents 3cbef2a + aece7b7 commit cc29c4c

File tree

11 files changed

+458
-139
lines changed

11 files changed

+458
-139
lines changed

adapter_pipelines/Optimus/adapter.wdl

Lines changed: 10 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -11,43 +11,19 @@ task GetInputs {
1111

1212
command <<<
1313
python <<CODE
14-
from pipeline_tools import dcp_utils
1514
from pipeline_tools import input_utils
1615
17-
# Get bundle manifest
18-
uuid = '${bundle_uuid}'
19-
version = '${bundle_version}'
20-
dss_url = '${dss_url}'
21-
retry_seconds = ${retry_seconds}
22-
timeout_seconds = ${timeout_seconds}
23-
print('Getting bundle manifest for id {0}, version {1}'.format(uuid, version))
24-
manifest = dcp_utils.get_manifest(uuid, version, dss_url, timeout_seconds, retry_seconds)
25-
manifest_files = dcp_utils.get_manifest_file_dicts(manifest)
26-
27-
input_metadata_file_uuid = input_utils.get_input_metadata_file_uuid(manifest_files)
28-
input_metadata_json = dcp_utils.get_file_by_uuid(input_metadata_file_uuid, dss_url)
29-
30-
# Parse inputs from metadata and write to fastq_inputs
31-
print('Writing fastq inputs to fastq_inputs.tsv')
32-
sample_id = input_utils.get_sample_id(input_metadata_json)
33-
lanes = input_utils.get_optimus_lanes(input_metadata_json)
34-
r1, r2, i1 = input_utils.get_optimus_inputs(lanes, manifest_files)
35-
fastq_inputs = [list(i) for i in zip(r1, r2, i1)]
36-
print(fastq_inputs)
37-
38-
with open('fastq_inputs.tsv', 'w') as f:
39-
for line in fastq_inputs:
40-
f.write('\t'.join(line) +'\n')
41-
42-
print('Writing sample ID to inputs.tsv')
43-
sample_id = input_utils.get_sample_id(input_metadata_json)
44-
with open('inputs.tsv', 'w') as f:
45-
f.write('{0}'.format(sample_id))
46-
print('Wrote input map')
16+
input_utils.create_optimus_input_tsv(
17+
"${bundle_manifest}",
18+
"${bundle_version}",
19+
"${dss_url}",
20+
${retry_seconds},
21+
${timeout_seconds})
22+
4723
CODE
4824
>>>
4925
runtime {
50-
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11"
26+
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.12.0"
5127
}
5228
output {
5329
String sample_id = read_string("inputs.tsv")
@@ -88,7 +64,7 @@ task inputs_for_submit {
8864
>>>
8965

9066
runtime {
91-
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11"
67+
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.12.0"
9268
}
9369

9470
output {
@@ -127,7 +103,7 @@ task outputs_for_submit {
127103
>>>
128104

129105
runtime {
130-
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11"
106+
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.12.0"
131107
}
132108

133109
output {

adapter_pipelines/ss2_single_sample/adapter.wdl

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,36 +11,19 @@ task GetInputs {
1111

1212
command <<<
1313
python <<CODE
14-
from pipeline_tools import dcp_utils
1514
from pipeline_tools import input_utils
1615
17-
# Get bundle manifest
18-
uuid = "${bundle_uuid}"
19-
version = "${bundle_version}"
20-
dss_url = "${dss_url}"
21-
retry_seconds = ${retry_seconds}
22-
timeout_seconds = ${timeout_seconds}
23-
print("Getting bundle manifest for id {0}, version {1}".format(uuid, version))
24-
manifest = dcp_utils.get_manifest(uuid, version, dss_url, timeout_seconds, retry_seconds)
25-
manifest_files = dcp_utils.get_manifest_file_dicts(manifest)
16+
input_utils.create_ss2_input_tsv(
17+
"${bundle_uuid}",
18+
"${bundle_version}",
19+
"${dss_url}",
20+
${retry_seconds},
21+
${timeout_seconds})
2622
27-
inputs_metadata_file_uuid = input_utils.get_input_metadata_file_uuid(manifest_files)
28-
inputs_metadata_json = dcp_utils.get_file_by_uuid(inputs_metadata_file_uuid, dss_url)
29-
30-
sample_id = input_utils.get_sample_id(inputs_metadata_json)
31-
fastq_1_name, fastq_2_name = input_utils.get_smart_seq_2_fastq_names(inputs_metadata_json)
32-
fastq_1_url = dcp_utils.get_file_url(manifest_files, fastq_1_name)
33-
fastq_2_url = dcp_utils.get_file_url(manifest_files, fastq_2_name)
34-
35-
print("Creating input map")
36-
with open("inputs.tsv", "w") as f:
37-
f.write("fastq_1\tfastq_2\tsample_id\n")
38-
f.write("{0}\t{1}\t{2}\n".format(fastq_1_url, fastq_2_url, sample_id))
39-
print("Wrote input map")
4023
CODE
4124
>>>
4225
runtime {
43-
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11"
26+
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.12.0"
4427
}
4528
output {
4629
Object inputs = read_object("inputs.tsv")

pipeline_tools/input_utils.py

Lines changed: 135 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,82 @@
11
from pipeline_tools import dcp_utils
22

33

4-
def get_sample_id(metadata, version='4'):
4+
def get_sample_id(metadata, version):
55
"""Return the sample id from the given metadata"""
6-
if version == '4':
6+
if version.startswith('4.'):
77
return _get_sample_id_v4(metadata)
8+
elif version.startswith('5.'):
9+
return _get_sample_id_v5(metadata)
810
else:
9-
raise NotImplementedError('Only implemented for v4 metadata')
11+
raise NotImplementedError('Only implemented for v4 and v5 metadata')
12+
13+
14+
def _get_sample_id_v5(links_json):
15+
"""Return sample id from links json"""
16+
links = links_json['links']
17+
def is_sample_link(link):
18+
return link['source_type'] == 'biomaterial' and link['destination_type'] == 'sequencing_process'
19+
sample_links = list(filter(lambda x: is_sample_link(x), links))
20+
num_links = len(sample_links)
21+
if num_links == 0:
22+
raise ValueError('No sample link found')
23+
elif num_links > 1:
24+
raise ValueError('Expecting one sample link. {0} found'.format(num_links))
25+
return sample_links[0]['source_id']
1026

1127

1228
def _get_sample_id_v4(assay_json):
1329
"""Return sample id from assay json"""
1430
return assay_json["has_input"]
1531

1632

17-
def get_input_metadata_file_uuid(manifest_files, version='4'):
33+
def get_input_metadata_file_uuid(manifest_files, version):
1834
"""Get the uuid of the file containing metadata about pipeline input files,
1935
e.g. assay.json in v4"""
20-
if version == '5':
36+
if version.startswith('5.'):
2137
return _get_input_metadata_file_uuid_v5(manifest_files)
22-
elif version == '4':
38+
elif version.startswith('4.'):
2339
return _get_input_metadata_file_uuid_v4(manifest_files)
2440
else:
2541
raise NotImplementedError('Only implemented for v4 and v5 metadata')
2642

2743

2844
def _get_input_metadata_file_uuid_v5(manifest_files):
2945
"""Get the uuid of the files.json file"""
30-
return dcp_utils.get_file_uuid(manifest_files, 'files.json')
46+
return dcp_utils.get_file_uuid(manifest_files, 'file.json')
3147

3248

3349
def _get_input_metadata_file_uuid_v4(manifest_files):
3450
"""Get the uuid of the assay.json file"""
3551
return dcp_utils.get_file_uuid(manifest_files, 'assay.json')
3652

3753

38-
def get_smart_seq_2_fastq_names(metadata, version='4'):
54+
def get_sample_id_file_uuid(manifest_files, version):
55+
"""Get the uuid of the file containing the sample id,
56+
e.g. assay.json in v4"""
57+
if version.startswith('5.'):
58+
return _get_sample_id_file_uuid_v5(manifest_files)
59+
elif version.startswith('4.'):
60+
return _get_sample_id_file_uuid_v4(manifest_files)
61+
else:
62+
raise NotImplementedError('Only implemented for v4 and v5 metadata')
63+
64+
65+
def _get_sample_id_file_uuid_v5(manifest_files):
66+
"""Get the uuid of the links.json file"""
67+
return dcp_utils.get_file_uuid(manifest_files, 'links.json')
68+
69+
70+
def _get_sample_id_file_uuid_v4(manifest_files):
71+
"""Get the uuid of the assay.json file"""
72+
return dcp_utils.get_file_uuid(manifest_files, 'assay.json')
73+
74+
75+
def get_smart_seq_2_fastq_names(metadata, version):
3976
"""Get the fastq file names from the given metadata"""
40-
if version == '5':
77+
if version.startswith('5.'):
4178
return _get_smart_seq_2_fastq_names_v5(metadata)
42-
elif version == '4':
79+
elif version.startswith('4.'):
4380
return _get_smart_seq_2_fastq_names_v4(metadata)
4481
else:
4582
raise NotImplementedError('Only implemented for v4 and v5 metadata')
@@ -62,9 +99,9 @@ def _get_smart_seq_2_fastq_names_v4(assay_json):
6299
return fastq_1_name, fastq_2_name
63100

64101

65-
def get_optimus_lanes(metadata_json, version='4'):
102+
def get_optimus_lanes(metadata_json, version):
66103
"""Get the lane metadata"""
67-
if version == '4':
104+
if version.startswith('4.'):
68105
return _get_optimus_lanes_v4(metadata_json)
69106
else:
70107
raise NotImplementedError('Only implemented for v4 metadata')
@@ -85,3 +122,89 @@ def get_optimus_inputs(lanes, manifest_files):
85122
i1 = [manifest_files['name_to_meta'][lane['i1']]['url'] for lane in lanes]
86123

87124
return r1, r2, i1
125+
126+
127+
def is_v5_or_higher(name_to_meta):
128+
"""Return true if schema of bundle metadata is at least 5.0.0"""
129+
if 'assay.json' in name_to_meta:
130+
return False
131+
elif 'file.json' in name_to_meta:
132+
return True
133+
else:
134+
raise ValueError('No assay.json and no process.json. Cannot determine metadata schema version.')
135+
136+
137+
def detect_schema_version(file_json):
138+
"""Return the bundle's metadata schema version"""
139+
files = file_json['files']
140+
if len(files) == 0:
141+
raise ValueError('No files in bundle')
142+
schema_url = files[0]['content']['describedBy']
143+
version = schema_url.split('/')[-2]
144+
return version
145+
146+
147+
def get_metadata_to_process(manifest_files, dss_url, is_v5_or_higher):
148+
"""Return the metadata json that we need to parse to set up pipeline inputs"""
149+
if not is_v5_or_higher:
150+
schema_version = '4.x'
151+
sample_id_file_uuid = dcp_utils.get_file_uuid(manifest_files, 'sample.json')
152+
inputs_metadata_file_uuid = dcp_utils.get_file_uuid(manifest_files, 'assay.json')
153+
inputs_metadata_json = dcp_utils.get_file_by_uuid(inputs_metadata_file_uuid, dss_url)
154+
sample_id_file_json = inputs_metadata_json
155+
else:
156+
sample_id_file_uuid = dcp_utils.get_file_uuid(manifest_files, 'links.json')
157+
inputs_metadata_file_uuid = dcp_utils.get_file_uuid(manifest_files, 'file.json')
158+
inputs_metadata_json = dcp_utils.get_file_by_uuid(inputs_metadata_file_uuid, dss_url)
159+
schema_version = detect_schema_version(inputs_metadata_json)
160+
sample_id_file_json = dcp_utils.get_file_by_uuid(sample_id_file_uuid, dss_url)
161+
return inputs_metadata_json, sample_id_file_json, schema_version
162+
163+
164+
def create_ss2_input_tsv(uuid, version, dss_url, retry_seconds, timeout_seconds):
165+
"""Create tsv of Smart-seq2 inputs"""
166+
# Get bundle manifest
167+
print("Getting bundle manifest for id {0}, version {1}".format(uuid, version))
168+
manifest = dcp_utils.get_manifest(uuid, version, dss_url, timeout_seconds, retry_seconds)
169+
manifest_files = dcp_utils.get_manifest_file_dicts(manifest)
170+
171+
inputs_metadata_json, sample_id_file_json, schema_version = get_metadata_to_process(
172+
manifest_files, dss_url, is_v5_or_higher(manifest_files['name_to_meta']))
173+
174+
sample_id = get_sample_id(sample_id_file_json, schema_version)
175+
fastq_1_name, fastq_2_name = get_smart_seq_2_fastq_names(inputs_metadata_json, schema_version)
176+
fastq_1_url = dcp_utils.get_file_url(manifest_files, fastq_1_name)
177+
fastq_2_url = dcp_utils.get_file_url(manifest_files, fastq_2_name)
178+
179+
print("Creating input map")
180+
with open("inputs.tsv", "w") as f:
181+
f.write("fastq_1\tfastq_2\tsample_id\n")
182+
f.write("{0}\t{1}\t{2}\n".format(fastq_1_url, fastq_2_url, sample_id))
183+
print("Wrote input map")
184+
185+
186+
def create_optimus_input_tsv(uuid, version, dss_url, retry_seconds, timeout_seconds):
187+
"""Create tsv of Optimus inputs"""
188+
# Get bundle manifest
189+
print('Getting bundle manifest for id {0}, version {1}'.format(uuid, version))
190+
manifest = dcp_utils.get_manifest(uuid, version, dss_url, timeout_seconds, retry_seconds)
191+
manifest_files = dcp_utils.get_manifest_file_dicts(manifest)
192+
193+
inputs_metadata_json, sample_id_file_json, schema_version = get_metadata_to_process(
194+
manifest_files, dss_url, is_v5_or_higher(manifest_files['name_to_meta']))
195+
196+
# Parse inputs from metadata and write to fastq_inputs
197+
print('Writing fastq inputs to fastq_inputs.tsv')
198+
sample_id = get_sample_id(sample_id_file_json, schema_version)
199+
lanes = get_optimus_lanes(input_metadata_json, schema_version)
200+
r1, r2, i1 = get_optimus_inputs(lanes, manifest_files)
201+
fastq_inputs = [list(i) for i in zip(r1, r2, i1)]
202+
print(fastq_inputs)
203+
204+
with open('fastq_inputs.tsv', 'w') as f:
205+
for line in fastq_inputs:
206+
f.write('\t'.join(line) +'\n')
207+
print('Writing sample ID to inputs.tsv')
208+
f.write('{0}'.format(sample_id))
209+
210+
print('Wrote input map')
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
{
2+
"links": [
3+
{
4+
"source_id": "7c72f20a-8199-497a-bba7-1316520ba847",
5+
"source_type": "biomaterial",
6+
"destination_id": "336c7184-70a7-41b3-ab31-98f78b0ab092",
7+
"destination_type": "dissociation_process"
8+
},
9+
{
10+
"source_id": "336c7184-70a7-41b3-ab31-98f78b0ab092",
11+
"source_type": "dissociation_process",
12+
"destination_id": "94049366-4e4b-4076-8474-b2a00d7aa08d",
13+
"destination_type": "biomaterial"
14+
},
15+
{
16+
"source_id": "8c01ac9e-751b-4426-9032-c4235503a164",
17+
"source_type": "process",
18+
"destination_id": "7c72f20a-8199-497a-bba7-1316520ba847",
19+
"destination_type": "biomaterial"
20+
},
21+
{
22+
"source_id": "cd9d52fa-e89d-4c37-bb0e-997a2affe1dc",
23+
"source_type": "biomaterial",
24+
"destination_id": "8c01ac9e-751b-4426-9032-c4235503a164",
25+
"destination_type": "process"
26+
},
27+
{
28+
"source_id": "test_sample_id",
29+
"source_type": "biomaterial",
30+
"destination_id": "adebcaaa-0cea-44af-8cce-fdc5dcadb286",
31+
"destination_type": "sequencing_process"
32+
},
33+
{
34+
"source_id": "adebcaaa-0cea-44af-8cce-fdc5dcadb286",
35+
"source_type": "sequencing_process",
36+
"destination_id": "cb4f1900-cdcf-4e3d-834e-21c6b3e59237",
37+
"destination_type": "file"
38+
},
39+
{
40+
"source_id": "adebcaaa-0cea-44af-8cce-fdc5dcadb286",
41+
"source_type": "sequencing_process",
42+
"destination_id": "ed7989fb-1ca0-48ba-9b81-f83c4af7b091",
43+
"destination_type": "file"
44+
}
45+
],
46+
"describedBy": "https://schema.humancellatlas.org/bundle/1.0.0/links",
47+
"schema_version": "1.0.0",
48+
"schema_type": "link_bundle"
49+
}

0 commit comments

Comments
 (0)