11import argparse
2- import pathlib
2+ import gzip
33import os
4+ import pathlib
45
56import pandas as pd
67
78
89class FeatureCountCollator :
def __init__(self, count_dir, prefix, column, recursive=False, suffix=".txt.gz"):
    """Store the collation settings and index the count files right away.

    Args:
        count_dir: Directory that is scanned for count files.
        prefix: Path prefix used when naming the collated output tables.
        column: Name of the count-table column to collate.
        recursive: Passed on to ``_collect_count_files``; when false only
            the top level of ``count_dir`` is scanned.
        suffix: File-name ending a count file must carry.
    """
    # Input location and file-name convention.
    self.count_dir = count_dir
    self.suffix = suffix
    # Output naming and the column to extract.
    self.prefix = prefix
    self.column = column
    # category -> [(sample, path), ...]; filled by the scan below.
    self.categories = {}
    self._collect_count_files(recursive=recursive)
1517
@staticmethod
def is_valid_file(f, suffix):
    """Return True if file name ``f`` is a count table that should be collated.

    A name qualifies when it ends with ``suffix`` and is not one of the
    auxiliary outputs whose names end in ``.seqname.dist1``,
    ``.seqname.uniq``, ``.gene_counts`` or ``.ambig_tmp`` followed by
    ``suffix``.

    Args:
        f: File name (or path) to test.
        suffix: Required file-name ending, e.g. ``".txt.gz"``.

    Returns:
        bool: Whether the file should be picked up.
    """
    excluded_endings = tuple(
        f"{infix}{suffix}"
        for infix in (".seqname.dist1", ".seqname.uniq", ".gene_counts", ".ambig_tmp")
    )
    # str.endswith accepts a tuple, so all exclusions are checked in one
    # call, and only when the suffix itself matches.
    return f.endswith(suffix) and not f.endswith(excluded_endings)
2527
def _collect_count_files(self, recursive=False):
    """Scan ``self.count_dir`` and group the count files by category.

    File names are expected to look like ``<sample>.<category><suffix>``.
    Each accepted file is recorded in ``self.categories`` as a
    ``(sample, path)`` tuple under its category.

    Args:
        recursive: When false, only the first directory yielded by
            ``os.walk`` (``count_dir`` itself) is scanned; when true,
            all sub-directories are scanned too.
    """
    all_files = []
    for pwd, _, files in os.walk(self.count_dir):
        all_files.extend(
            os.path.join(pwd, f)
            for f in files
            if self.is_valid_file(f, self.suffix)
        )
        if not recursive:
            break

    suffix_len = len(self.suffix)
    for f in all_files:
        name = os.path.basename(f)
        # Strip only the trailing suffix. str.replace() would also delete
        # the suffix text from inside a sample name. Slicing from the
        # front keeps an empty suffix from wiping the whole name.
        stem = name[:len(name) - suffix_len]
        sample, category = os.path.splitext(stem)
        # category still carries its leading dot from splitext.
        self.categories.setdefault(category[1:], []).append((sample, f))
3638
3739 def collate (self ):
@@ -40,18 +42,18 @@ def collate(self):
4042 self ._collate_category (category , sorted (files ))
4143
def _collate_category(self, category, files):
    """Merge one category's per-sample tables into a single table file.

    The result is written tab-separated to
    ``{prefix}.{category}.{column}.txt.gz`` with one column per sample
    (taken from ``self.column``) and ``NA`` for missing values.

    Args:
        category: Category name used in the output file name.
        files: Iterable of ``(sample, path)`` tuples. It is traversed
            twice, so it must be re-iterable (e.g. a list).

    Raises:
        KeyError: If a table lacks ``self.column``, the ``uniq_raw``
            column or the ``unannotated`` row.
    """
    table_file = f"{self.prefix}.{category}.{self.column}.txt.gz"

    # First pass: gather every feature id (first column) across all files.
    index = set()
    for _, fn in files:
        # Follow the same extension rule pandas uses below, so that
        # uncompressed inputs (a non-.gz suffix) can be read as well.
        opener = gzip.open if str(fn).endswith(".gz") else open
        with opener(fn, "rt") as _in:
            index.update(row.strip().split("\t")[0] for row in _in if row.strip())

    # 'feature' is the header cell; 'unannotated' is listed first.
    merged_tab = pd.DataFrame(
        index=["unannotated"] + sorted(index.difference({"feature", "unannotated"}))
    )

    # Second pass: add one column per sample.
    for sample, fn in files:
        src_tab = pd.read_csv(fn, sep="\t", index_col=0)
        merged_tab = merged_tab.merge(
            src_tab[self.column], left_index=True, right_index=True, how="outer"
        )
        merged_tab = merged_tab.rename(columns={self.column: sample})
        # The 'unannotated' row always takes its value from 'uniq_raw',
        # whatever column is collated. A single .loc assignment is used
        # because chained indexing (df[col][row] = ...) does not write
        # through when pandas Copy-on-Write is active.
        merged_tab.loc["unannotated", sample] = src_tab.loc["unannotated", "uniq_raw"]

    merged_tab.to_csv(table_file, sep="\t", na_rep="NA", index_label="feature")
5557
5658
5759def main ():
0 commit comments