11from pipeline_tools import dcp_utils
22
33
4- def get_sample_id (metadata , version = '4' ):
4+ def get_sample_id (metadata , version ):
55 """Return the sample id from the given metadata"""
6- if version == '4' :
6+ if version . startswith ( '4.' ) :
77 return _get_sample_id_v4 (metadata )
8+ elif version .startswith ('5.' ):
9+ return _get_sample_id_v5 (metadata )
810 else :
9- raise NotImplementedError ('Only implemented for v4 metadata' )
11+ raise NotImplementedError ('Only implemented for v4 and v5 metadata' )
12+
13+
14+ def _get_sample_id_v5 (links_json ):
15+ """Return sample id from links json"""
16+ links = links_json ['links' ]
17+ def is_sample_link (link ):
18+ return link ['source_type' ] == 'biomaterial' and link ['destination_type' ] == 'sequencing_process'
19+ sample_links = list (filter (lambda x : is_sample_link (x ), links ))
20+ num_links = len (sample_links )
21+ if num_links == 0 :
22+ raise ValueError ('No sample link found' )
23+ elif num_links > 1 :
24+ raise ValueError ('Expecting one sample link. {0} found' .format (num_links ))
25+ return sample_links [0 ]['source_id' ]
1026
1127
1228def _get_sample_id_v4 (assay_json ):
1329 """Return sample id from assay json"""
1430 return assay_json ["has_input" ]
1531
1632
17- def get_input_metadata_file_uuid (manifest_files , version = '4' ):
33+ def get_input_metadata_file_uuid (manifest_files , version ):
1834 """Get the uuid of the file containing metadata about pipeline input files,
1935 e.g. assay.json in v4"""
20- if version == '5' :
36+ if version . startswith ( '5.' ) :
2137 return _get_input_metadata_file_uuid_v5 (manifest_files )
22- elif version == '4' :
38+ elif version . startswith ( '4.' ) :
2339 return _get_input_metadata_file_uuid_v4 (manifest_files )
2440 else :
2541 raise NotImplementedError ('Only implemented for v4 and v5 metadata' )
2642
2743
2844def _get_input_metadata_file_uuid_v5 (manifest_files ):
2945 """Get the uuid of the files.json file"""
30- return dcp_utils .get_file_uuid (manifest_files , 'files .json' )
46+ return dcp_utils .get_file_uuid (manifest_files , 'file .json' )
3147
3248
3349def _get_input_metadata_file_uuid_v4 (manifest_files ):
3450 """Get the uuid of the assay.json file"""
3551 return dcp_utils .get_file_uuid (manifest_files , 'assay.json' )
3652
3753
38- def get_smart_seq_2_fastq_names (metadata , version = '4' ):
54+ def get_sample_id_file_uuid (manifest_files , version ):
55+ """Get the uuid of the file containing the sample id,
56+ e.g. assay.json in v4"""
57+ if version .startswith ('5.' ):
58+ return _get_sample_id_file_uuid_v5 (manifest_files )
59+ elif version .startswith ('4.' ):
60+ return _get_sample_id_file_uuid_v4 (manifest_files )
61+ else :
62+ raise NotImplementedError ('Only implemented for v4 and v5 metadata' )
63+
64+
65+ def _get_sample_id_file_uuid_v5 (manifest_files ):
66+ """Get the uuid of the links.json file"""
67+ return dcp_utils .get_file_uuid (manifest_files , 'links.json' )
68+
69+
70+ def _get_sample_id_file_uuid_v4 (manifest_files ):
71+ """Get the uuid of the assay.json file"""
72+ return dcp_utils .get_file_uuid (manifest_files , 'assay.json' )
73+
74+
75+ def get_smart_seq_2_fastq_names (metadata , version ):
3976 """Get the fastq file names from the given metadata"""
40- if version == '5' :
77+ if version . startswith ( '5.' ) :
4178 return _get_smart_seq_2_fastq_names_v5 (metadata )
42- elif version == '4' :
79+ elif version . startswith ( '4.' ) :
4380 return _get_smart_seq_2_fastq_names_v4 (metadata )
4481 else :
4582 raise NotImplementedError ('Only implemented for v4 and v5 metadata' )
@@ -62,9 +99,9 @@ def _get_smart_seq_2_fastq_names_v4(assay_json):
6299 return fastq_1_name , fastq_2_name
63100
64101
65- def get_optimus_lanes (metadata_json , version = '4' ):
102+ def get_optimus_lanes (metadata_json , version ):
66103 """Get the lane metadata"""
67- if version == '4' :
104+ if version . startswith ( '4.' ) :
68105 return _get_optimus_lanes_v4 (metadata_json )
69106 else :
70107 raise NotImplementedError ('Only implemented for v4 metadata' )
@@ -85,3 +122,89 @@ def get_optimus_inputs(lanes, manifest_files):
85122 i1 = [manifest_files ['name_to_meta' ][lane ['i1' ]]['url' ] for lane in lanes ]
86123
87124 return r1 , r2 , i1
125+
126+
127+ def is_v5_or_higher (name_to_meta ):
128+ """Return true if schema of bundle metadata is at least 5.0.0"""
129+ if 'assay.json' in name_to_meta :
130+ return False
131+ elif 'file.json' in name_to_meta :
132+ return True
133+ else :
134+ raise ValueError ('No assay.json and no process.json. Cannot determine metadata schema version.' )
135+
136+
137+ def detect_schema_version (file_json ):
138+ """Return the bundle's metadata schema version"""
139+ files = file_json ['files' ]
140+ if len (files ) == 0 :
141+ raise ValueError ('No files in bundle' )
142+ schema_url = files [0 ]['content' ]['describedBy' ]
143+ version = schema_url .split ('/' )[- 2 ]
144+ return version
145+
146+
147+ def get_metadata_to_process (manifest_files , dss_url , is_v5_or_higher ):
148+ """Return the metadata json that we need to parse to set up pipeline inputs"""
149+ if not is_v5_or_higher :
150+ schema_version = '4.x'
151+ sample_id_file_uuid = dcp_utils .get_file_uuid (manifest_files , 'sample.json' )
152+ inputs_metadata_file_uuid = dcp_utils .get_file_uuid (manifest_files , 'assay.json' )
153+ inputs_metadata_json = dcp_utils .get_file_by_uuid (inputs_metadata_file_uuid , dss_url )
154+ sample_id_file_json = inputs_metadata_json
155+ else :
156+ sample_id_file_uuid = dcp_utils .get_file_uuid (manifest_files , 'links.json' )
157+ inputs_metadata_file_uuid = dcp_utils .get_file_uuid (manifest_files , 'file.json' )
158+ inputs_metadata_json = dcp_utils .get_file_by_uuid (inputs_metadata_file_uuid , dss_url )
159+ schema_version = detect_schema_version (inputs_metadata_json )
160+ sample_id_file_json = dcp_utils .get_file_by_uuid (sample_id_file_uuid , dss_url )
161+ return inputs_metadata_json , sample_id_file_json , schema_version
162+
163+
164+ def create_ss2_input_tsv (uuid , version , dss_url , retry_seconds , timeout_seconds ):
165+ """Create tsv of Smart-seq2 inputs"""
166+ # Get bundle manifest
167+ print ("Getting bundle manifest for id {0}, version {1}" .format (uuid , version ))
168+ manifest = dcp_utils .get_manifest (uuid , version , dss_url , timeout_seconds , retry_seconds )
169+ manifest_files = dcp_utils .get_manifest_file_dicts (manifest )
170+
171+ inputs_metadata_json , sample_id_file_json , schema_version = get_metadata_to_process (
172+ manifest_files , dss_url , is_v5_or_higher (manifest_files ['name_to_meta' ]))
173+
174+ sample_id = get_sample_id (sample_id_file_json , schema_version )
175+ fastq_1_name , fastq_2_name = get_smart_seq_2_fastq_names (inputs_metadata_json , schema_version )
176+ fastq_1_url = dcp_utils .get_file_url (manifest_files , fastq_1_name )
177+ fastq_2_url = dcp_utils .get_file_url (manifest_files , fastq_2_name )
178+
179+ print ("Creating input map" )
180+ with open ("inputs.tsv" , "w" ) as f :
181+ f .write ("fastq_1\t fastq_2\t sample_id\n " )
182+ f .write ("{0}\t {1}\t {2}\n " .format (fastq_1_url , fastq_2_url , sample_id ))
183+ print ("Wrote input map" )
184+
185+
186+ def create_optimus_input_tsv (uuid , version , dss_url , retry_seconds , timeout_seconds ):
187+ """Create tsv of Optimus inputs"""
188+ # Get bundle manifest
189+ print ('Getting bundle manifest for id {0}, version {1}' .format (uuid , version ))
190+ manifest = dcp_utils .get_manifest (uuid , version , dss_url , timeout_seconds , retry_seconds )
191+ manifest_files = dcp_utils .get_manifest_file_dicts (manifest )
192+
193+ inputs_metadata_json , sample_id_file_json , schema_version = get_metadata_to_process (
194+ manifest_files , dss_url , is_v5_or_higher (manifest_files ['name_to_meta' ]))
195+
196+ # Parse inputs from metadata and write to fastq_inputs
197+ print ('Writing fastq inputs to fastq_inputs.tsv' )
198+ sample_id = get_sample_id (sample_id_file_json , schema_version )
199+ lanes = get_optimus_lanes (input_metadata_json , schema_version )
200+ r1 , r2 , i1 = get_optimus_inputs (lanes , manifest_files )
201+ fastq_inputs = [list (i ) for i in zip (r1 , r2 , i1 )]
202+ print (fastq_inputs )
203+
204+ with open ('fastq_inputs.tsv' , 'w' ) as f :
205+ for line in fastq_inputs :
206+ f .write ('\t ' .join (line ) + '\n ' )
207+ print ('Writing sample ID to inputs.tsv' )
208+ f .write ('{0}' .format (sample_id ))
209+
210+ print ('Wrote input map' )
0 commit comments