2020
2121# Extra columns that some databases have (we want to know which ones)
2222EXTRA_COLUMNS = {
23- "channel" , "data_call" , "data_num" , "data_p_val" , "data_p_value" ,
24- "genome" , "genome_id" , "log" , "orthogroup" , "p_val" , "project_id" ,
25- "qvalue" , "sample_file_name" , "sample_tissue" , "version" ,
23+ "channel" ,
24+ "data_call" ,
25+ "data_num" ,
26+ "data_p_val" ,
27+ "data_p_value" ,
28+ "genome" ,
29+ "genome_id" ,
30+ "log" ,
31+ "orthogroup" ,
32+ "p_val" ,
33+ "project_id" ,
34+ "qvalue" ,
35+ "sample_file_name" ,
36+ "sample_tissue" ,
37+ "version" ,
2638}
2739
2840
@@ -94,7 +106,9 @@ def main():
94106
95107 # ---- 2. Group databases by their 3-column signature ----
96108 print ("\n " + "=" * 80 )
97- print ("GROUPING BY SIGNATURE (probeset_type, probeset_nullable, signal_nullable, signal_default, bot_type, bot_nullable)" )
109+ print (
110+ "GROUPING BY SIGNATURE (probeset_type, probeset_nullable, signal_nullable, signal_default, bot_type, bot_nullable)"
111+ )
98112 print ("=" * 80 )
99113
100114 sig_groups = defaultdict (list )
@@ -103,7 +117,9 @@ def main():
103117 sig_groups [sig ].append (db )
104118
105119 for sig , dbs in sorted (sig_groups .items (), key = lambda x : - len (x [1 ])):
106- print (f"\n Signature: probeset={ sig [0 ]} (nullable={ sig [1 ]} ) signal(nullable={ sig [2 ]} , default={ sig [3 ]} ) bot={ sig [4 ]} (nullable={ sig [5 ]} )" )
120+ print (
121+ f"\n Signature: probeset={ sig [0 ]} (nullable={ sig [1 ]} ) signal(nullable={ sig [2 ]} , default={ sig [3 ]} ) bot={ sig [4 ]} (nullable={ sig [5 ]} )"
122+ )
107123 print (f" Count: { len (dbs )} " )
108124 print (f" DBs: { ', ' .join (dbs [:10 ])} { '...' if len (dbs ) > 10 else '' } " )
109125
@@ -135,15 +151,17 @@ def main():
135151 # Determine extra columns this DB needs
136152 extras = set (cols .keys ()) - NEEDED_COLUMNS - {"proj_id" , "sample_id" }
137153
138- compact_entries .append ({
139- "db" : db ,
140- "probeset_len" : probeset_len , # None = tinytext
141- "probeset_type" : probeset_type ,
142- "bot_len" : bot_len , # None = tinytext
143- "bot_type" : bot_type ,
144- "signal_nullable" : signal_nullable ,
145- "extras" : extras ,
146- })
154+ compact_entries .append (
155+ {
156+ "db" : db ,
157+ "probeset_len" : probeset_len , # None = tinytext
158+ "probeset_type" : probeset_type ,
159+ "bot_len" : bot_len , # None = tinytext
160+ "bot_type" : bot_type ,
161+ "signal_nullable" : signal_nullable ,
162+ "extras" : extras ,
163+ }
164+ )
147165
148166 # ---- 4. Show the most compact table-driven representation ----
149167 print ("\n " + "=" * 80 )
@@ -180,9 +198,7 @@ def main():
180198 for e in compact_entries :
181199 # Filter out databases that ONLY have unneeded extras
182200 # (sample_file_name, data_call, data_p_val etc. are not needed)
183- has_important_extras = e ["extras" ] - {
184- "sample_file_name" , "data_call" , "data_p_val" , "data_p_value" , "data_num"
185- }
201+ has_important_extras = e ["extras" ] - {"sample_file_name" , "data_call" , "data_p_val" , "data_p_value" , "data_num" }
186202 if has_important_extras :
187203 complex_dbs .append (e )
188204 else :
@@ -217,11 +233,13 @@ def main():
217233 with open (SAMPLE_DATA_CSV , newline = "" ) as f :
218234 reader = csv .DictReader (f )
219235 for row in reader :
220- db_samples [row ["source_database" ]].append ({
221- "data_bot_id" : row ["data_bot_id" ],
222- "data_probeset_id" : row ["data_probeset_id" ],
223- "data_signal" : row ["data_signal" ],
224- })
236+ db_samples [row ["source_database" ]].append (
237+ {
238+ "data_bot_id" : row ["data_bot_id" ],
239+ "data_probeset_id" : row ["data_probeset_id" ],
240+ "data_signal" : row ["data_signal" ],
241+ }
242+ )
225243
226244 print (f"Total databases with sample data: { len (db_samples )} " )
227245 print (f"Total sample rows: { sum (len (v ) for v in db_samples .values ())} " )
0 commit comments