1- from ftplib import FTP
2- import os
31import bz2
2+ import logging
3+ import os
4+ import sys
45import urllib .request
5- from Bio import AlignIO
6+ from ftplib import FTP
7+ from typing import Union
68
9+ from Bio import AlignIO
710
8- def get_from_ftp (pdb_code , path_to_store = '.' ,
9- ftp_server = 'ftp.cmbi.umcn.nl' , ftp_path = '/pub/molbio/data/hssp/' ):
10- """Downloads using FTP protocol an HSSP alignment for the given pdb_code"""
# Set logging: module-wide logger that writes INFO and above to stdout.
logger = logging.getLogger("hssp_log")
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
formatter = logging.Formatter("%(name)s [%(levelname)s] %(message)s")
ch.setFormatter(formatter)
# logging.getLogger returns the same logger object for the same name, so only
# attach the handler when none is installed yet; otherwise reloading this
# module would stack handlers and print every message several times.
if not logger.handlers:
    logger.addHandler(ch)
19+
20+
21+ def get_from_ftp (
22+ pdb_code : str ,
23+ path_to_store : str = "." ,
24+ ftp_server : str = "ftp.cmbi.umcn.nl" ,
25+ ftp_path : str = "/pub/molbio/data/hssp/" ,
26+ ) -> Union [str , None ]:
27+ """Downloads an HSSP alignment file for the given pdb_code using FTP protocol.
28+
29+ Args:
30+ pdb_code (str): The PDB code for which to download the HSSP alignment file.
31+ path_to_store (str, optional): The path where the downloaded file should be stored. Defaults to ".".
32+ ftp_server (str, optional): The FTP server to connect to. Defaults to "ftp.cmbi.umcn.nl".
33+ ftp_path (str, optional): The path on the FTP server where the HSSP alignments are stored. Defaults to "/pub/molbio/data/hssp/".
34+
35+ Returns:
36+ Union[str, None]: The path to the downloaded file if successful, None otherwise.
37+ """
38+ path_to_file : str = ""
1139 # Start connection
1240 try :
1341 ftp = FTP (ftp_server )
@@ -16,33 +44,39 @@ def get_from_ftp(pdb_code, path_to_store='.',
1644 # Move to path where HSSP alignments are stored
1745 ftp .cwd (ftp_path )
1846 # File name format
19- file_name = ' {}.hssp.bz2' .format (pdb_code .lower ())
47+ file_name = " {}.hssp.bz2" .format (pdb_code .lower ())
2048 # Retrieve file
2149 path_to_file = os .path .join (path_to_store , file_name )
22- ftp .retrbinary ("RETR " + file_name , open (path_to_file , 'wb' ).write )
50+ ftp .retrbinary ("RETR " + file_name , open (path_to_file , "wb" ).write )
2351 # Close connection
2452 ftp .close ()
2553
2654 return path_to_file
27- except :
28- if os .path .exists (path_to_file ):
55+ except Exception as e :
56+ logger .exception (e )
57+ logger .error ("There was an error downloading the file from the FTP server." )
58+ if path_to_file != "" and os .path .exists (path_to_file ):
2959 os .remove (path_to_file )
3060 return None
3161
def get_from_url(
    pdb_code: str,
    path_to_store: str = ".",
    url: str = "ftp://ftp.cmbi.umcn.nl/pub/molbio/data/hssp3/",
) -> str:
    """Downloads from HSSP3 online db the HSSP file in stockholm format.

    Args:
        pdb_code (str): The PDB code of the entry; it is lower-cased to build
            the remote file name ``<pdb_code>.hssp.bz2``.
        path_to_store (str, optional): Directory where the downloaded file is
            written. Defaults to ".".
        url (str, optional): Base URL (ending in a slash) the remote file name
            is appended to.

    Returns:
        str: The path of the stored file, named ``<pdb_code>.hssp3.bz2``.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` is propagated
        to the caller; nothing is caught here.
    """
    # No padding inside the literal: the remote name is "<code>.hssp.bz2"
    file_name = "{}.hssp.bz2".format(pdb_code.lower())
    # Make sure we use hssp3 instead of simple hssp to help identifying them
    path_to_file = os.path.join(path_to_store, file_name.replace("hssp", "hssp3"))
    urllib.request.urlretrieve(url + file_name, path_to_file)
    return path_to_file
4072
4173
def decompress_bz2(file_name_input, file_name_output):
    """Expand the BZ2 file file_name_input and store the raw bytes in file_name_output.

    The output file is opened (and therefore created or truncated) before the
    compressed input is opened. Data is copied in pieces of 100 KiB so the
    whole archive never has to be held in memory at once.
    """
    chunk_size = 100 * 1024
    with open(file_name_output, "wb") as target:
        with bz2.BZ2File(file_name_input, "rb") as compressed:
            while True:
                chunk = compressed.read(chunk_size)
                # An empty bytes object signals the end of the stream
                if chunk == b"":
                    break
                target.write(chunk)
4781
4882
@@ -53,7 +87,7 @@ def _parse_hssp_proteins(line_buffer):
5387 if line .startswith (" NR." ) or line .startswith ("##" ):
5488 continue
5589 # Only get the id and name of the protein in the alignment
56- fields = (line [:20 ]).split (':' )
90+ fields = (line [:20 ]).split (":" )
5791 seq_id = int (fields [0 ]) - 1
5892 name = fields [1 ].strip ()[:10 ]
5993 proteins [seq_id ] = name
@@ -67,30 +101,30 @@ def _parse_hssp_alignments(line_buffer, chain_id, num_alignments):
67101 last_alignment = 0
68102 current_num_alignments = 0
69103 for line in line_buffer :
70- if line .startswith (" SeqNo" ) or line [12 ] == '!' :
104+ if line .startswith (" SeqNo" ) or line [12 ] == "!" :
71105 continue
72106 if line .startswith ("## ALIGNMENTS" ):
73- fields = (line [13 :]).split ('-' )
107+ fields = (line [13 :]).split ("-" )
74108 # We are now parsing alignments from first to last specified
75109 # in the ALIGNMENTS header
76110 first_alignment = int (fields [0 ]) - 1
77111 last_alignment = int (fields [1 ]) - 1
78112 current_num_alignments = last_alignment - first_alignment + 1
79113 else :
80- if line [12 ] == chain_id and line [14 ] != 'X' :
81- for i , s in enumerate (line [51 : 51 + current_num_alignments ]):
114+ if line [12 ] == chain_id and line [14 ] != "X" :
115+ for i , s in enumerate (line [51 : 51 + current_num_alignments ]):
82116 # We will convert spaces or dots to -
83- if s == '.' or s == ' ' :
84- s = '-'
117+ if s == "." or s == " " :
118+ s = "-"
85119 # We leave residues in minor case as if to not forget insertions
86120 alignments [first_alignment + i ].append (s )
87- alignments = [('' .join (s )) for s in alignments ]
121+ alignments = [("" .join (s )) for s in alignments ]
88122 return alignments
89123
90124
91125def hssp_file_to_phylip (hssp_file_name , phylip_file_name , chain_id , master_sequence ):
92126 """Parses an HSSP file and returns a list of the sequences"""
93- # We're only interested in the lenght of the sequence of our given chain_id,
127+ # We're only interested in the length of the sequence of our given chain_id,
94128 # the SEQLENGTH header gives us the sum of all.
95129 seqlength = len (master_sequence )
96130 num_alignments = 0
@@ -103,13 +137,13 @@ def hssp_file_to_phylip(hssp_file_name, phylip_file_name, chain_id, master_seque
103137 with open (hssp_file_name , "rU" ) as handle :
104138 for line in handle :
105139 line = line .rstrip (os .linesep )
106- if line .startswith (' NCHAIN' ):
140+ if line .startswith (" NCHAIN" ):
107141 num_chains = int (line .split ()[1 ])
108- if line .startswith (' NALIGN' ):
142+ if line .startswith (" NALIGN" ):
109143 num_alignments = int (line .split ()[1 ])
110-
111- parsing = ( seqlength != 0 and num_chains != 0 and num_alignments != 0 )
112-
144+
145+ parsing = seqlength != 0 and num_chains != 0 and num_alignments != 0
146+
113147 if parsing :
114148 if line .startswith ("## ALIGNMENTS" ):
115149 parsing_alignment = True
@@ -130,33 +164,45 @@ def hssp_file_to_phylip(hssp_file_name, phylip_file_name, chain_id, master_seque
130164 prot_line_buffer .append (line )
131165
132166 proteins = _parse_hssp_proteins (prot_line_buffer )
133- alignments = _parse_hssp_alignments (line_buffer , chain_id .upper (), num_alignments )
167+ alignments = _parse_hssp_alignments (
168+ line_buffer , chain_id .upper (), num_alignments
169+ )
134170
135- all_zero = ( sum ([len (a ) for a in alignments ]) == 0 )
171+ all_zero = sum ([len (a ) for a in alignments ]) == 0
136172
137173 if all_zero :
138- raise Exception ("Not a single alignment found for chain {}" .format (chain_id ))
139-
140- non_valid = [k for k in proteins .keys () if alignments [k ].count ('-' ) >= seqlength ]
141- with open (phylip_file_name , 'w' ) as output_handle :
174+ raise Exception (
175+ "Not a single alignment found for chain {}" .format (chain_id )
176+ )
177+
178+ non_valid = [
179+ k for k in proteins .keys () if alignments [k ].count ("-" ) >= seqlength
180+ ]
181+ with open (phylip_file_name , "w" ) as output_handle :
142182 # Write header, MASTER also counts
143- output_handle .write ("{} {}{}" .format (len (proteins ) - len (non_valid ) + 1 , seqlength , os .linesep ))
183+ output_handle .write (
184+ "{} {}{}" .format (
185+ len (proteins ) - len (non_valid ) + 1 , seqlength , os .linesep
186+ )
187+ )
144188 # Write master sequence
145189 output_handle .write ("MASTER {}{}" .format (master_sequence , os .linesep ))
146190 # Write the rest of non null alignments
147191 for k in sorted (proteins .keys ()):
148192 if k not in non_valid :
149- output_handle .write ("{:10s}{}{}" .format (proteins [k ], alignments [k ], os .linesep ))
193+ output_handle .write (
194+ "{:10s}{}{}" .format (proteins [k ], alignments [k ], os .linesep )
195+ )
150196
151197
def hssp3_file_to_phylip(hssp3_file_name, phylip_file_name, chain_id, master_sequence):
    """Reads a HSSP file in stockholm format and writes a new msa file in
    phylip-sequential format only containing the given chain.

    Args:
        hssp3_file_name: Path of the Stockholm-format HSSP3 file to read.
        phylip_file_name: Path of the phylip-sequential file to write.
        chain_id: Chain to extract; compared case-insensitively.
        master_sequence: Not used by this function.

    For every alignment whose first record name has a "/" at index 4 and the
    requested chain letter at index 5, the first record is relabelled
    "MASTER" and the alignment is written to phylip_file_name. Each matching
    alignment is written to the same path. Nothing is written when no
    alignment matches.
    """
    # Upper-case the requested chain: the chain read from the record name is
    # upper-cased too, so a lower-case chain_id could otherwise never match.
    wanted_chain = chain_id.upper()
    # Parse the whole input first so a malformed file fails before any output
    # is written.
    alignments = list(AlignIO.parse(hssp3_file_name, format="stockholm"))
    for align in alignments:
        master = align[0]
        name = master.name
        # Names too short to hold "xxxx/C" are skipped instead of raising
        # IndexError.
        if len(name) > 5 and name[4] == "/" and name[5].upper() == wanted_chain:
            master.id = master.name = master.description = "MASTER"
            # The master sequence is written as parsed; its gaps are kept.
            AlignIO.write(align, phylip_file_name, format="phylip-sequential")
0 commit comments