|
| 1 | + |
| 2 | +import os |
| 3 | +import sys |
| 4 | + |
# Usage text; printed when the script is started with too few arguments.
USAGE = """\
To run:
    python HOGcomposition.py in_folder fastoma_out_folder

in_folder          : folder that contains the "proteome" sub-folder
fastoma_out_folder : folder that contains RootHOGs.tsv

Output:
    HOGcomposition.tsv, written to the current working directory.
    It is a species composition table: HOGs in rows, species in columns,
    and each cell lists the protein names of that HOG for that species.
"""

# File extensions (without the dot) that mark a proteome FASTA file.
FASTA_EXTENSIONS = ("fa", "fasta")

# Name of the table written to the current working directory.
OUTPUT_FILE = "HOGcomposition.tsv"


def _find_proteome_files(proteome_folder):
    """Return a list of (species_name, file_name) for every FASTA file found.

    The species name is the file name without its last extension. Entries
    without a dot, or with an extension other than "fa"/"fasta", are skipped.
    The listing is sorted so that the column order of the output table is
    reproducible (os.listdir itself returns entries in arbitrary order).
    """
    proteome_files = []
    for file_name in sorted(os.listdir(proteome_folder)):
        species_name, dot, ext = file_name.rpartition(".")
        if dot and ext in FASTA_EXTENSIONS:
            proteome_files.append((species_name, file_name))
    return proteome_files


def _map_proteins_to_species(proteome_folder, proteome_files):
    """Return {protein_name: species_name} read from the FASTA headers.

    The protein name is the header text between ">" and the first space.
    Each file is opened under its own name, so ".fa" and ".fasta" files may
    be mixed in the same folder.
    """
    prot2species = {}
    for species_name, file_name in proteome_files:
        with open(os.path.join(proteome_folder, file_name), "r") as file_prot:
            for line in file_prot:
                if line.startswith(">"):
                    prot_name = line.strip().split(" ")[0][1:]
                    prot2species[prot_name] = species_name
    return prot2species


def _read_roothogs(roothog_path):
    """Return {roothog: [protein, ...]} parsed from a RootHOGs.tsv file.

    Lines starting with "RootHOG" are treated as the header and skipped, as
    are blank lines. Only the first two tab-separated columns are used
    (RootHOG id, protein name); further columns are ignored.

    Raises:
        ValueError: if a non-blank line has fewer than two columns.
    """
    roothog_dic = {}
    with open(roothog_path, "r") as roothog_file:
        for line_number, line in enumerate(roothog_file, start=1):
            if line.startswith("RootHOG"):
                continue
            # Strip only the line terminator: a plain strip() would also
            # remove a trailing tab and hide an empty last column.
            fields = line.rstrip("\r\n").split("\t")
            if fields == [""]:
                continue
            if len(fields) < 2:
                raise ValueError(
                    "%s, line %d: expected at least 2 tab-separated columns"
                    % (roothog_path, line_number)
                )
            roothog, protein = fields[0], fields[1]
            roothog_dic.setdefault(roothog, []).append(protein)
    return roothog_dic


def _composition_row(roothog, prots, species_names, prot2species):
    """Return one output line (with trailing newline) for a single RootHOG.

    Each species cell holds the quoted protein names separated by ", "
    (the text of the Python list without its brackets); a species with no
    protein in this HOG gets an empty cell.

    Raises:
        KeyError: if a protein of the HOG is not present in prot2species.
    """
    prot_species_dic = {species_name: [] for species_name in species_names}
    for prot in prots:
        if prot not in prot2species:
            raise KeyError(
                "protein %r of %s was not found in any proteome file"
                % (prot, roothog)
            )
        prot_species_dic[prot2species[prot]].append(prot)

    # str([])[1:-1] is already '', so empty cells need no special case.
    prot_species = [
        str(prot_species_dic[species_name])[1:-1]
        for species_name in species_names
    ]
    return roothog + '\t' + '\t'.join(prot_species) + '\n'


def main(argv=None):
    """Build HOGcomposition.tsv from a proteome folder and a RootHOGs.tsv.

    Args:
        argv: [in_folder, fastoma_out_folder]; defaults to sys.argv[1:].

    Exits with the usage text when fewer than two arguments are given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit(USAGE)
    in_folder, fastoma_out_folder = args[0], args[1]

    proteome_folder = os.path.join(in_folder, "proteome")
    proteome_files = _find_proteome_files(proteome_folder)
    # Query/input species names based on the file names, without duplicates
    # (e.g. when both sp.fa and sp.fasta exist).
    species_names = list(dict.fromkeys(name for name, _ in proteome_files))
    print("number of species:", len(species_names))

    prot2species = _map_proteins_to_species(proteome_folder, proteome_files)
    print("total number of proteins in the fasta files", len(prot2species))

    roothog_dic = _read_roothogs(os.path.join(fastoma_out_folder, "RootHOGs.tsv"))
    print("number of HOGs", len(roothog_dic))

    with open(OUTPUT_FILE, 'w') as file_out:
        file_out.write('RootHOG' + '\t' + '\t'.join(species_names) + '\n')
        for roothog, prots in roothog_dic.items():
            file_out.write(
                _composition_row(roothog, prots, species_names, prot2species)
            )

    print("output is written as file ", "HOGcomposition.tsv")


if __name__ == "__main__":
    main()
0 commit comments