Skip to content
This repository was archived by the owner on Jan 22, 2026. It is now read-only.

Commit 4aaa485

Browse files
authored
Merge pull request #10 from haddocking/9-unbound-variable-in-hsspget_from_ftp
Refactor `get_from_ftp` to remove unbound variable
2 parents 5382a32 + 8504d86 commit 4aaa485

File tree

1 file changed

+90
-44
lines changed

1 file changed

+90
-44
lines changed

libwhiscy/hssp.py

Lines changed: 90 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,41 @@
1-
from ftplib import FTP
2-
import os
31
import bz2
2+
import logging
3+
import os
4+
import sys
45
import urllib.request
5-
from Bio import AlignIO
6+
from ftplib import FTP
7+
from typing import Union
68

9+
from Bio import AlignIO
710

8-
def get_from_ftp(pdb_code, path_to_store='.',
9-
ftp_server='ftp.cmbi.umcn.nl', ftp_path='/pub/molbio/data/hssp/'):
10-
"""Downloads using FTP protocol an HSSP alignment for the given pdb_code"""
11+
# Set logging
12+
logger = logging.getLogger("hssp_log")
13+
logger.setLevel(logging.INFO)
14+
ch = logging.StreamHandler(sys.stdout)
15+
ch.setLevel(logging.INFO)
16+
formatter = logging.Formatter("%(name)s [%(levelname)s] %(message)s")
17+
ch.setFormatter(formatter)
18+
logger.addHandler(ch)
19+
20+
21+
def get_from_ftp(
22+
pdb_code: str,
23+
path_to_store: str = ".",
24+
ftp_server: str = "ftp.cmbi.umcn.nl",
25+
ftp_path: str = "/pub/molbio/data/hssp/",
26+
) -> Union[str, None]:
27+
"""Downloads an HSSP alignment file for the given pdb_code using FTP protocol.
28+
29+
Args:
30+
pdb_code (str): The PDB code for which to download the HSSP alignment file.
31+
path_to_store (str, optional): The path where the downloaded file should be stored. Defaults to ".".
32+
ftp_server (str, optional): The FTP server to connect to. Defaults to "ftp.cmbi.umcn.nl".
33+
ftp_path (str, optional): The path on the FTP server where the HSSP alignments are stored. Defaults to "/pub/molbio/data/hssp/".
34+
35+
Returns:
36+
Union[str, None]: The path to the downloaded file if successful, None otherwise.
37+
"""
38+
path_to_file: str = ""
1139
# Start connection
1240
try:
1341
ftp = FTP(ftp_server)
@@ -16,33 +44,39 @@ def get_from_ftp(pdb_code, path_to_store='.',
1644
# Move to path where HSSP alignments are stored
1745
ftp.cwd(ftp_path)
1846
# File name format
19-
file_name = '{}.hssp.bz2'.format(pdb_code.lower())
47+
file_name = "{}.hssp.bz2".format(pdb_code.lower())
2048
# Retrieve file
2149
path_to_file = os.path.join(path_to_store, file_name)
22-
ftp.retrbinary("RETR " + file_name, open(path_to_file, 'wb').write)
50+
ftp.retrbinary("RETR " + file_name, open(path_to_file, "wb").write)
2351
# Close connection
2452
ftp.close()
2553

2654
return path_to_file
27-
except:
28-
if os.path.exists(path_to_file):
55+
except Exception as e:
56+
logger.exception(e)
57+
logger.error("There was an error downloading the file from the FTP server.")
58+
if path_to_file != "" and os.path.exists(path_to_file):
2959
os.remove(path_to_file)
3060
return None
3161

32-
def get_from_url(pdb_code, path_to_store='.',
33-
url='ftp://ftp.cmbi.umcn.nl/pub/molbio/data/hssp3/'):
62+
63+
def get_from_url(
64+
pdb_code, path_to_store=".", url="ftp://ftp.cmbi.umcn.nl/pub/molbio/data/hssp3/"
65+
):
3466
"""Downloads from HSSP3 online db the HSSP file in stockholm format"""
35-
file_name = '{}.hssp.bz2'.format(pdb_code.lower())
67+
file_name = "{}.hssp.bz2".format(pdb_code.lower())
3668
# Make sure we use hssp3 instead of simple hssp to help identifying them
37-
path_to_file = os.path.join(path_to_store, file_name.replace('hssp', 'hssp3'))
69+
path_to_file = os.path.join(path_to_store, file_name.replace("hssp", "hssp3"))
3870
urllib.request.urlretrieve(url + file_name, path_to_file)
3971
return path_to_file
4072

4173

4274
def decompress_bz2(file_name_input, file_name_output):
43-
"""Decompresses file_name_input in BZ2 format into file_name_output"""
44-
with open(file_name_output, 'wb') as new_file, bz2.BZ2File(file_name_input, 'rb') as file:
45-
for data in iter(lambda : file.read(100 * 1024), b''):
75+
"""Decompresses file_name_input in BZ2 format into file_name_output"""
76+
with open(file_name_output, "wb") as new_file, bz2.BZ2File(
77+
file_name_input, "rb"
78+
) as file:
79+
for data in iter(lambda: file.read(100 * 1024), b""):
4680
new_file.write(data)
4781

4882

@@ -53,7 +87,7 @@ def _parse_hssp_proteins(line_buffer):
5387
if line.startswith(" NR.") or line.startswith("##"):
5488
continue
5589
# Only get the id and name of the protein in the alignment
56-
fields = (line[:20]).split(':')
90+
fields = (line[:20]).split(":")
5791
seq_id = int(fields[0]) - 1
5892
name = fields[1].strip()[:10]
5993
proteins[seq_id] = name
@@ -67,30 +101,30 @@ def _parse_hssp_alignments(line_buffer, chain_id, num_alignments):
67101
last_alignment = 0
68102
current_num_alignments = 0
69103
for line in line_buffer:
70-
if line.startswith(" SeqNo") or line[12] == '!':
104+
if line.startswith(" SeqNo") or line[12] == "!":
71105
continue
72106
if line.startswith("## ALIGNMENTS"):
73-
fields = (line[13:]).split('-')
107+
fields = (line[13:]).split("-")
74108
# We are now parsing alignments from first to last specified
75109
# in the ALIGNMENTS header
76110
first_alignment = int(fields[0]) - 1
77111
last_alignment = int(fields[1]) - 1
78112
current_num_alignments = last_alignment - first_alignment + 1
79113
else:
80-
if line[12] == chain_id and line[14] != 'X':
81-
for i, s in enumerate(line[51:51+current_num_alignments]):
114+
if line[12] == chain_id and line[14] != "X":
115+
for i, s in enumerate(line[51 : 51 + current_num_alignments]):
82116
# We will convert spaces or dots to -
83-
if s == '.' or s == ' ':
84-
s = '-'
117+
if s == "." or s == " ":
118+
s = "-"
85119
# We leave residues in lower case so as not to lose track of insertions
86120
alignments[first_alignment + i].append(s)
87-
alignments = [(''.join(s)) for s in alignments]
121+
alignments = [("".join(s)) for s in alignments]
88122
return alignments
89123

90124

91125
def hssp_file_to_phylip(hssp_file_name, phylip_file_name, chain_id, master_sequence):
92126
"""Parses an HSSP file and returns a list of the sequences"""
93-
# We're only interested in the lenght of the sequence of our given chain_id,
127+
# We're only interested in the length of the sequence of our given chain_id,
94128
# SEQLENGTH header gives us the sum of all.
95129
seqlength = len(master_sequence)
96130
num_alignments = 0
@@ -103,13 +137,13 @@ def hssp_file_to_phylip(hssp_file_name, phylip_file_name, chain_id, master_seque
103137
with open(hssp_file_name, "rU") as handle:
104138
for line in handle:
105139
line = line.rstrip(os.linesep)
106-
if line.startswith('NCHAIN'):
140+
if line.startswith("NCHAIN"):
107141
num_chains = int(line.split()[1])
108-
if line.startswith('NALIGN'):
142+
if line.startswith("NALIGN"):
109143
num_alignments = int(line.split()[1])
110-
111-
parsing = (seqlength != 0 and num_chains != 0 and num_alignments != 0)
112-
144+
145+
parsing = seqlength != 0 and num_chains != 0 and num_alignments != 0
146+
113147
if parsing:
114148
if line.startswith("## ALIGNMENTS"):
115149
parsing_alignment = True
@@ -130,33 +164,45 @@ def hssp_file_to_phylip(hssp_file_name, phylip_file_name, chain_id, master_seque
130164
prot_line_buffer.append(line)
131165

132166
proteins = _parse_hssp_proteins(prot_line_buffer)
133-
alignments = _parse_hssp_alignments(line_buffer, chain_id.upper(), num_alignments)
167+
alignments = _parse_hssp_alignments(
168+
line_buffer, chain_id.upper(), num_alignments
169+
)
134170

135-
all_zero = (sum([len(a) for a in alignments]) == 0)
171+
all_zero = sum([len(a) for a in alignments]) == 0
136172

137173
if all_zero:
138-
raise Exception("Not a single alignment found for chain {}".format(chain_id))
139-
140-
non_valid = [k for k in proteins.keys() if alignments[k].count('-') >= seqlength]
141-
with open(phylip_file_name, 'w') as output_handle:
174+
raise Exception(
175+
"Not a single alignment found for chain {}".format(chain_id)
176+
)
177+
178+
non_valid = [
179+
k for k in proteins.keys() if alignments[k].count("-") >= seqlength
180+
]
181+
with open(phylip_file_name, "w") as output_handle:
142182
# Write header, MASTER also counts
143-
output_handle.write("{} {}{}".format(len(proteins) - len(non_valid) + 1, seqlength, os.linesep))
183+
output_handle.write(
184+
"{} {}{}".format(
185+
len(proteins) - len(non_valid) + 1, seqlength, os.linesep
186+
)
187+
)
144188
# Write master sequence
145189
output_handle.write("MASTER {}{}".format(master_sequence, os.linesep))
146190
# Write the rest of non null alignments
147191
for k in sorted(proteins.keys()):
148192
if k not in non_valid:
149-
output_handle.write("{:10s}{}{}".format(proteins[k], alignments[k], os.linesep))
193+
output_handle.write(
194+
"{:10s}{}{}".format(proteins[k], alignments[k], os.linesep)
195+
)
150196

151197

152198
def hssp3_file_to_phylip(hssp3_file_name, phylip_file_name, chain_id, master_sequence):
153199
"""Reads a HSSP file in stockholm format and writes a new msa file in phylip-sequential format
154200
only containing the given chain"""
155-
alignments = list(AlignIO.parse(hssp3_file_name, format='stockholm'))
201+
alignments = list(AlignIO.parse(hssp3_file_name, format="stockholm"))
156202
for align in alignments:
157-
if align[0].name[4] == '/':
203+
if align[0].name[4] == "/":
158204
chain = align[0].name[5].upper()
159205
if chain == chain_id:
160-
align[0].id = align[0].name = align[0].description = 'MASTER'
161-
#align[0].seq = align[0].seq.ungap('-')
162-
AlignIO.write(align, phylip_file_name, format='phylip-sequential')
206+
align[0].id = align[0].name = align[0].description = "MASTER"
207+
# align[0].seq = align[0].seq.ungap('-')
208+
AlignIO.write(align, phylip_file_name, format="phylip-sequential")

0 commit comments

Comments
 (0)