Skip to content

Commit 3cbef2a

Browse files
author
David Shiga
authored
Merge pull request #24 from HumanCellAtlas/ds_schema_test
Make it possible to handle different metadata versions using the same version of pipeline-tools code. Switch to pytest and add a metadata schema integration test. Add some support for v5 metadata.
2 parents f4004dc + 3cfa0fa commit 3cbef2a

File tree

18 files changed

+788
-55
lines changed

18 files changed

+788
-55
lines changed

.travis.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@ python:
33
- '2.7'
44
- '3.6'
55
install: pip install -r requirements.txt -r test-requirements.txt
6-
script: python -m unittest discover -v
6+
env:
7+
- TEST_SUITE=unit
8+
- TEST_SUITE=latest_schema
9+
script: bash test.sh $TEST_SUITE
710
notifications:
811
slack:
912
on_success: change

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,26 @@
33
This repo contains Python code and pipelines for interacting with the Human Cell Atlas Data Coordination Platform. They are used by the Secondary Analysis Service.
44

55
The pipelines wrap analysis pipelines from the Skylab repo and provide some glue to interface with the DCP. The adapter pipelines take bundle ids as inputs, query the Data Storage Service to find the input files needed by the analysis pipelines, then run the analysis pipelines and submit the results to the Ingest Service. This helps us keep the analysis pipelines themselves free of dependencies on the DCP.
6+
7+
## Run tests
8+
9+
### Create a virtual environment
10+
11+
```
12+
virtualenv pipeline-tools-test-env
13+
source pipeline-tools-test-env/bin/activate
14+
pip install -r test-requirements.txt
15+
```
16+
17+
### Run unit tests
18+
19+
```
20+
bash test.sh
21+
```
22+
23+
### Run schema tests
24+
25+
```
26+
export TEST_SUITE=latest_schema
27+
bash test.sh
28+
```

adapter_pipelines/Optimus/adapter.wdl

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ task GetInputs {
1212
command <<<
1313
python <<CODE
1414
from pipeline_tools import dcp_utils
15+
from pipeline_tools import input_utils
1516
1617
# Get bundle manifest
1718
uuid = '${bundle_uuid}'
@@ -20,18 +21,17 @@ task GetInputs {
2021
retry_seconds = ${retry_seconds}
2122
timeout_seconds = ${timeout_seconds}
2223
print('Getting bundle manifest for id {0}, version {1}'.format(uuid, version))
23-
manifest_files = dcp_utils.get_manifest_files(uuid, version, dss_url, timeout_seconds, retry_seconds)
24+
manifest = dcp_utils.get_manifest(uuid, version, dss_url, timeout_seconds, retry_seconds)
25+
manifest_files = dcp_utils.get_manifest_file_dicts(manifest)
2426
25-
print('Downloading assay.json')
26-
assay_json_uuid = manifest_files['name_to_meta']['assay.json']['uuid']
27-
assay_json = dcp_utils.get_file_by_uuid(assay_json_uuid, dss_url)
27+
input_metadata_file_uuid = input_utils.get_input_metadata_file_uuid(manifest_files)
28+
input_metadata_json = dcp_utils.get_file_by_uuid(input_metadata_file_uuid, dss_url)
2829
29-
# Parse inputs from assay_json and write to fastq_inputs
30+
# Parse inputs from metadata and write to fastq_inputs
3031
print('Writing fastq inputs to fastq_inputs.tsv')
31-
lanes = assay_json['content']['seq']['lanes']
32-
r1 = [manifest_files['name_to_meta'][lane['r1']]['url'] for lane in lanes]
33-
r2 = [manifest_files['name_to_meta'][lane['r2']]['url'] for lane in lanes]
34-
i1 = [manifest_files['name_to_meta'][lane['i1']]['url'] for lane in lanes]
32+
sample_id = input_utils.get_sample_id(input_metadata_json)
33+
lanes = input_utils.get_optimus_lanes(input_metadata_json)
34+
r1, r2, i1 = input_utils.get_optimus_inputs(lanes, manifest_files)
3535
fastq_inputs = [list(i) for i in zip(r1, r2, i1)]
3636
print(fastq_inputs)
3737
@@ -40,14 +40,14 @@ task GetInputs {
4040
f.write('\t'.join(line) +'\n')
4141
4242
print('Writing sample ID to inputs.tsv')
43-
sample_id = assay_json['has_input']
43+
sample_id = input_utils.get_sample_id(input_metadata_json)
4444
with open('inputs.tsv', 'w') as f:
4545
f.write('{0}'.format(sample_id))
4646
print('Wrote input map')
4747
CODE
4848
>>>
4949
runtime {
50-
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.9"
50+
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11"
5151
}
5252
output {
5353
String sample_id = read_string("inputs.tsv")
@@ -88,7 +88,7 @@ task inputs_for_submit {
8888
>>>
8989

9090
runtime {
91-
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.9"
91+
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11"
9292
}
9393

9494
output {
@@ -127,7 +127,7 @@ task outputs_for_submit {
127127
>>>
128128

129129
runtime {
130-
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.9"
130+
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11"
131131
}
132132

133133
output {

adapter_pipelines/ss2_single_sample/adapter.wdl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ task GetInputs {
1212
command <<<
1313
python <<CODE
1414
from pipeline_tools import dcp_utils
15+
from pipeline_tools import input_utils
1516
1617
# Get bundle manifest
1718
uuid = "${bundle_uuid}"
@@ -20,17 +21,16 @@ task GetInputs {
2021
retry_seconds = ${retry_seconds}
2122
timeout_seconds = ${timeout_seconds}
2223
print("Getting bundle manifest for id {0}, version {1}".format(uuid, version))
23-
manifest_files = dcp_utils.get_manifest_files(uuid, version, dss_url, timeout_seconds, retry_seconds)
24+
manifest = dcp_utils.get_manifest(uuid, version, dss_url, timeout_seconds, retry_seconds)
25+
manifest_files = dcp_utils.get_manifest_file_dicts(manifest)
2426
25-
print("Downloading assay.json")
26-
assay_json_uuid = manifest_files["name_to_meta"]["assay.json"]["uuid"]
27-
assay_json = dcp_utils.get_file_by_uuid(assay_json_uuid, dss_url)
27+
inputs_metadata_file_uuid = input_utils.get_input_metadata_file_uuid(manifest_files)
28+
inputs_metadata_json = dcp_utils.get_file_by_uuid(inputs_metadata_file_uuid, dss_url)
2829
29-
sample_id = assay_json["has_input"]
30-
fastq_1_name = assay_json["content"]["seq"]["lanes"][0]["r1"]
31-
fastq_2_name = assay_json["content"]["seq"]["lanes"][0]["r2"]
32-
fastq_1_url = manifest_files["name_to_meta"][fastq_1_name]["url"]
33-
fastq_2_url = manifest_files["name_to_meta"][fastq_2_name]["url"]
30+
sample_id = input_utils.get_sample_id(inputs_metadata_json)
31+
fastq_1_name, fastq_2_name = input_utils.get_smart_seq_2_fastq_names(inputs_metadata_json)
32+
fastq_1_url = dcp_utils.get_file_url(manifest_files, fastq_1_name)
33+
fastq_2_url = dcp_utils.get_file_url(manifest_files, fastq_2_name)
3434
3535
print("Creating input map")
3636
with open("inputs.tsv", "w") as f:
@@ -40,7 +40,7 @@ task GetInputs {
4040
CODE
4141
>>>
4242
runtime {
43-
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.9"
43+
docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11"
4444
}
4545
output {
4646
Object inputs = read_object("inputs.tsv")

pipeline_tools/README.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,12 @@ To run unit tests, first create a virtual environment with the requirements::
4141

4242
Then, run unit tests from the root of the pipeline-tools repo like this::
4343

44-
python -m unittest discover -v
44+
bash test.sh
4545

46+
To run schema integration tests, do::
47+
48+
export TEST_SUITE="latest_schema"
49+
bash test.sh
4650

4751
create_analysis_json.py
4852
=======================

pipeline_tools/dcp_utils.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def get_file_by_uuid(file_id, dss_url):
1919
return response.json()
2020

2121

22-
def get_manifest_files(bundle_uuid, bundle_version, dss_url, timeout_seconds, retry_seconds):
22+
def get_manifest(bundle_uuid, bundle_version, dss_url, timeout_seconds, retry_seconds):
2323
"""
2424
Retrieve manifest.json file for a given bundle uuid and version.
2525
:param str bundle_uuid: Bundle unique id
@@ -48,19 +48,30 @@ def get_manifest_files(bundle_uuid, bundle_version, dss_url, timeout_seconds, re
4848
current = time.time()
4949
manifest = response.json()
5050

51+
return manifest
52+
53+
54+
def get_manifest_file_dicts(manifest):
5155
bundle = manifest['bundle']
5256
name_to_meta = {}
5357
url_to_name = {}
5458
for f in bundle['files']:
5559
name_to_meta[f['name']] = f
5660
url_to_name[f['url']] = f['name']
57-
5861
return {
5962
'name_to_meta': name_to_meta,
6063
'url_to_name': url_to_name
6164
}
6265

6366

67+
def get_file_uuid(manifest_file_dicts, file_name):
68+
return manifest_file_dicts['name_to_meta'][file_name]['uuid']
69+
70+
71+
def get_file_url(manifest_file_dicts, file_name):
72+
return manifest_file_dicts['name_to_meta'][file_name]['url']
73+
74+
6475
def get_auth_token(url="https://danielvaughan.eu.auth0.com/oauth/token",
6576
client_id="Zdsog4nDAnhQ99yiKwMQWAPc2qUDlR99",
6677
client_secret="t-OAE-GQk_nZZtWn-QQezJxDsLXmU7VSzlAh9cKW5vb87i90qlXGTvVNAjfT9weF",

pipeline_tools/input_utils.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
from pipeline_tools import dcp_utils
2+
3+
4+
def get_sample_id(metadata, version='4'):
    """Return the sample id recorded in the given metadata dict.

    :param dict metadata: input metadata (a v4 assay json)
    :param str version: metadata schema major version; only '4' is supported
    :raises NotImplementedError: for any version other than '4'
    """
    # Guard clause: reject unsupported versions up front.
    if version != '4':
        raise NotImplementedError('Only implemented for v4 metadata')
    return _get_sample_id_v4(metadata)


def _get_sample_id_v4(assay_json):
    """Return the sample id stored under 'has_input' in a v4 assay json."""
    return assay_json["has_input"]
15+
16+
17+
def get_input_metadata_file_uuid(manifest_files, version='4'):
    """Return the uuid of the bundle file that describes pipeline inputs.

    In v4 metadata that file is assay.json; in v5 it is files.json.

    :param dict manifest_files: manifest file dicts (with a 'name_to_meta' map)
    :param str version: metadata schema major version, '4' or '5'
    :raises NotImplementedError: for any other version
    """
    if version == '5':
        return _get_input_metadata_file_uuid_v5(manifest_files)
    if version == '4':
        return _get_input_metadata_file_uuid_v4(manifest_files)
    raise NotImplementedError('Only implemented for v4 and v5 metadata')


def _get_input_metadata_file_uuid_v5(manifest_files):
    """Return the uuid of files.json (v5 input metadata)."""
    return dcp_utils.get_file_uuid(manifest_files, 'files.json')


def _get_input_metadata_file_uuid_v4(manifest_files):
    """Return the uuid of assay.json (v4 input metadata)."""
    return dcp_utils.get_file_uuid(manifest_files, 'assay.json')
36+
37+
38+
def get_smart_seq_2_fastq_names(metadata, version='4'):
    """Return the (read1, read2) fastq file names from the given metadata.

    :param dict metadata: input metadata (v4 assay json or v5 files json)
    :param str version: metadata schema major version, '4' or '5'
    :raises NotImplementedError: for any other version
    """
    if version == '5':
        return _get_smart_seq_2_fastq_names_v5(metadata)
    if version == '4':
        return _get_smart_seq_2_fastq_names_v4(metadata)
    raise NotImplementedError('Only implemented for v4 and v5 metadata')


def _get_smart_seq_2_fastq_names_v5(files_json):
    """Return (read1 name, read2 name) from a v5 files json."""
    # Map each file's read_index ('read1'/'read2') to its file name.
    names_by_index = {
        entry['content']['read_index']: entry['content']['file_core']['file_name']
        for entry in files_json['files']
    }
    return names_by_index['read1'], names_by_index['read2']


def _get_smart_seq_2_fastq_names_v4(assay_json):
    """Return (r1 name, r2 name) of the first lane in a v4 assay json."""
    first_lane = assay_json["content"]["seq"]["lanes"][0]
    return first_lane["r1"], first_lane["r2"]
63+
64+
65+
def get_optimus_lanes(metadata_json, version='4'):
    """Return the list of lane dicts from the given metadata.

    :param dict metadata_json: input metadata (a v4 assay json)
    :param str version: metadata schema major version; only '4' is supported
    :raises NotImplementedError: for any version other than '4'
    """
    if version != '4':
        raise NotImplementedError('Only implemented for v4 metadata')
    return _get_optimus_lanes_v4(metadata_json)


def _get_optimus_lanes_v4(assay_json):
    """Return the lane metadata list from a v4 assay json."""
    return assay_json['content']['seq']['lanes']
77+
78+
79+
def get_optimus_inputs(lanes, manifest_files):
    """Return three lists of fastq urls, for r1, r2 and i1 respectively.

    Each list is ordered by lane: the first entry is the first lane's file,
    the second entry is the second lane's, and so on.

    :param list lanes: lane dicts, each mapping 'r1'/'r2'/'i1' to a file name
    :param dict manifest_files: manifest file dicts with a 'name_to_meta' map
    """
    name_to_meta = manifest_files['name_to_meta']

    def urls_for(read_key):
        # Resolve each lane's file name for this read to its manifest url.
        return [name_to_meta[lane[read_key]]['url'] for lane in lanes]

    return urls_for('r1'), urls_for('r2'), urls_for('i1')
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
{
2+
"content": {
3+
"single_cell": {
4+
"cell_handling": "10x_v2",
5+
"cell_barcode": {
6+
"read": "Read 1",
7+
"size": 16,
8+
"white_list_file": "pbmc8k_S1_L007_R1_001.fastq.gz,pbmc8k_S1_L008_R1_001.fastq.gz",
9+
"offset": 0
10+
}
11+
},
12+
"core": {
13+
"type": "assay",
14+
"schema_url": "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay.json",
15+
"schema_version": "4.6.1"
16+
},
17+
"rna": {
18+
"end_bias": "three_prime_end",
19+
"primer": "poly-dT",
20+
"strand": "both",
21+
"library_construction": "10x_v2"
22+
},
23+
"assay_id": "c349cce6-6d63-4976-832e-3c27ca1399ac",
24+
"seq": {
25+
"paired_ends": true,
26+
"lanes": [
27+
{
28+
"i1": "pbmc8k_S1_L007_I1_001.fastq.gz",
29+
"number": 7,
30+
"r2": "pbmc8k_S1_L007_R2_001.fastq.gz",
31+
"r1": "pbmc8k_S1_L007_R1_001.fastq.gz"
32+
},
33+
{
34+
"i1": "pbmc8k_S1_L008_I1_001.fastq.gz",
35+
"number": 8,
36+
"r2": "pbmc8k_S1_L008_R2_001.fastq.gz",
37+
"r1": "pbmc8k_S1_L008_R1_001.fastq.gz"
38+
}
39+
],
40+
"instrument_platform": "Illumina",
41+
"molecule": "polyA RNA",
42+
"instrument_model": "HiSeq 4000",
43+
"umi_barcode": {
44+
"read": "Read 1",
45+
"offset": 16,
46+
"size": 10
47+
}
48+
}
49+
},
50+
"core": {
51+
"type": "assay_bundle",
52+
"schema_url": "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay_bundle.json",
53+
"schema_version": "4.6.1"
54+
},
55+
"has_output": [
56+
"c34f9bda-1621-4596-b93f-797552368282",
57+
"ed7d5ab4-8589-4e50-bb6c-5d4b459b183c",
58+
"9a4a1656-faab-448e-9717-3fb16843a314",
59+
"b7e2cfc0-8d3f-40b4-adf2-3c44112259dc",
60+
"072461ba-e1da-40e2-aa5d-626eedad7fef",
61+
"58ea2f4b-c4af-4b1b-8b6a-484d46d37de5"
62+
],
63+
"hca_ingest": {
64+
"accession": "",
65+
"submissionDate": "2018-01-16T16:23:53.023Z",
66+
"lastModifiedUser": "anonymousUser",
67+
"updateDate": "2018-01-16T16:24:04.590Z",
68+
"document_id": "01425de2-dcd2-479c-899a-b84763767e74",
69+
"user": "anonymousUser"
70+
},
71+
"has_input": "42a6269e-8bc7-47ac-806b-3a53f8ba2a6f"
72+
}

0 commit comments

Comments
 (0)