Skip to content

Commit 26f25a0

Browse files
Merge pull request #117
Improved `pdb_selaltloc`
2 parents 881d4ff + bf1e254 commit 26f25a0

File tree

2 files changed

+428
-109
lines changed

2 files changed

+428
-109
lines changed

pdbtools/pdb_selaltloc.py

Lines changed: 246 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
data to another. They are based on old FORTRAN77 code that was taking too much
4040
effort to maintain and compile. RIP.
4141
"""
42-
4342
import operator
4443
import os
4544
import sys
@@ -113,8 +112,21 @@ def check_input(args):
113112

114113

115114
def select_by_occupancy(fhandle):
    """Select altlocs by occupancy; delegates to `select_altloc`."""
    # `selloc` is left at its default (None): occupancy drives the choice.
    return select_altloc(fhandle, byocc=True)
116+
117+
118+
def select_by_altloc(fhandle, selloc):
    """Select altlocs by identifier; delegates to `select_altloc`."""
    return select_altloc(fhandle, selloc=selloc, byocc=False)
120+
121+
122+
def select_altloc(fhandle, selloc=None, byocc=False):
    """
    Pick one altloc when atoms have more than one.

    If the specified altloc (selloc) is not present for this particular
    atom, outputs all altlocs. For instance, if atom X has altlocs A and
    B but the user picked C, we return A and B anyway. If atom Y has
    altlocs A, B, and C, then we only return C.

    This function is a generator.

    Parameters
    ----------
    fhandle : an iterable giving the PDB file line-by-line.

    selloc : str, optional
        The altloc identifier to keep. Must be given unless `byocc`
        is true.

    byocc : bool, optional
        If true, the occupancy-based flush functions are used instead
        of the identifier-based ones.

    Yields
    ------
    str (line-by-line)
        The PDB file with altlocs according to selection.

    Raises
    ------
    ValueError
        If neither `selloc` nor `byocc` is provided. Because this is a
        generator, the error surfaces when iteration starts.
    """
    if selloc is None and not byocc:
        raise ValueError('Provide either `selloc` or `byocc`.')

    altloc_lines = {}  # dict to capture the lines from a altloc group
    res_per_loc = {}  # dict to capture the residues per altloc group

    prev_altloc = ''
    prev_resname = ''
    prev_resnum = ''

    if byocc:
        flush_func_multi_residues = flush_resloc_occ
        flush_func_single_residues = flush_resloc_occ_same_residue
    else:
        flush_func_multi_residues = flush_resloc
        flush_func_single_residues = flush_resloc_id_same_residue

    def flush_pending():
        """Return a generator flushing (and clearing) the pending group."""
        # the flush strategy is decided at flush time because it depends
        # on what was captured for the group
        if partial_altloc(altloc_lines):
            flush_func = flush_func_single_residues
        else:
            flush_func = flush_func_multi_residues

        return flush_func(
            selloc=selloc,
            altloc_lines=altloc_lines,
            res_per_loc=res_per_loc,
            )

    records = ('ATOM', 'HETATM', 'ANISOU')
    # 'END' also matches 'ENDMDL' because `startswith` is a prefix test
    terminators = ('TER', 'END', 'CONECT')

    for line in fhandle:

        if line.startswith(records):
            # captures the relevant parameters
            altloc = line[16]
            resname = line[17:20]
            resnum = line[22:26].strip()

            if is_another_altloc_group(
                    altloc, prev_altloc, resnum, prev_resnum,
                    resname, prev_resname, altloc_lines, res_per_loc):
                # the altloc group has changed: flush the lines observed
                # for the previous group before storing the current line

                # uses for loop instead of "yield from" to maintain
                # compatibility with older python version
                for __line in flush_pending():
                    yield __line

            # saves the line per altloc identifier
            altloc_lines.setdefault(altloc, []).append(line)

            # registers which residues are seen for each identifier
            res_per_loc.setdefault(altloc, set()).add((resname, resnum))

            prev_altloc = altloc
            prev_resnum = resnum
            prev_resname = resname
            continue

        # before yielding a terminator line we flush the pending group;
        # any other non-coordinate line is passed through directly
        if altloc_lines and line.startswith(terminators):
            for __line in flush_pending():
                yield __line

        prev_altloc = ''
        prev_resname = ''
        prev_resnum = ''
        yield line

    # end of for loop
    # flush altloc residues in case the last residue was an altloc
    if altloc_lines:
        for __line in flush_pending():
            yield __line
229+
230+
231+
def _resnum_increased(current, previous):
    """Tell whether residue number `current` is greater than `previous`."""
    # Residue numbers reach us as strings; comparing them as text makes
    # '10' smaller than '9', so compare numerically whenever possible.
    try:
        return int(current) > int(previous)
    except (TypeError, ValueError):
        # non-numeric residue identifiers: keep the textual ordering
        return str(current) > str(previous)


def is_another_altloc_group(
        altloc,
        prev_altloc,
        resnum,
        prev_resnum,
        resname,
        prev_resname,
        altloc_lines,
        rploc,
        ):
    """
    Detect if the current line belongs to another altloc group.

    Parameters
    ----------
    altloc, resnum, resname : str
        Altloc identifier, residue number and residue name of the
        current line.

    prev_altloc, prev_resnum, prev_resname : str
        The same fields for the previous line. If any of them is empty
        there is no previous line to compare with and the result is
        False.

    altloc_lines : dict
        Lines captured so far, keyed by altloc identifier.

    rploc : dict
        Sets of residues seen so far, keyed by altloc identifier.

    Returns
    -------
    bool
        True if the pending group should be flushed before the current
        line is stored.
    """
    if not all((prev_altloc, prev_resname, prev_resnum)):
        return False

    moved_forward = _resnum_increased(resnum, prev_resnum)

    # a blank altloc in a later residue closes a group of alternates
    if prev_altloc != altloc and altloc == ' ' and moved_forward:
        return True

    # two consecutive lines without altloc from different residues
    if prev_altloc == ' ' and altloc == ' ':
        if resnum != prev_resnum or resname != prev_resname:
            return True

    # same non-blank altloc continuing into a later residue while more
    # than one altloc was captured and every altloc registered the same
    # number of residues
    residues_seen = list(rploc.values())
    return (
        prev_altloc == altloc
        and prev_altloc != ' '
        and altloc in altloc_lines
        and moved_forward
        and len(altloc_lines) > 1
        and all(len(v) == len(residues_seen[0]) for v in residues_seen[1:])
        )
267+
268+
269+
def flush_resloc(selloc, altloc_lines, res_per_loc):
    """
    Flush the captured altloc lines, selecting by altloc identifier.

    Yields the lines stored under `selloc` with the altloc column
    blanked. If the group has no such identifier, every captured line
    is yielded untouched. Both dictionaries are emptied once the
    generator is exhausted.
    """
    selected = altloc_lines.get(selloc)

    if selected is None:
        # the group does not contain the selected altloc:
        # all of its members are yielded as they are
        for group in altloc_lines.values():
            for record in group:
                yield record
    else:
        # only the selected altloc is yielded
        for record in selected:
            yield record[:16] + ' ' + record[17:]

    # ready for the next altloc group
    altloc_lines.clear()
    res_per_loc.clear()
286+
287+
288+
def flush_resloc_occ(altloc_lines, res_per_loc, **kw):
    """
    Flush the captured altloc lines by highest occupancy.

    The occupancy of an altloc is read from the first line captured for
    it (columns 55-60). The lines of the altloc with the highest
    occupancy are yielded with the altloc column blanked; on ties the
    altloc captured first wins. Extra keyword arguments are accepted
    and ignored so that all flush functions can be called alike. Both
    dictionaries are emptied once the generator is exhausted.
    """
    if altloc_lines:
        highest = 0.00
        altloc = ' '

        # detects which altloc identifier has the highest occupancy
        for key, lines2flush in altloc_lines.items():
            # we check only the first line because all atoms in one
            # identifier should have the same occupancy value
            occ = float(lines2flush[0][54:60])
            if occ > highest:
                altloc = key
                highest = occ

        if altloc not in altloc_lines:
            # no altloc has an occupancy above zero and there is no blank
            # altloc to default to: keep the first altloc captured rather
            # than failing with a KeyError
            altloc = next(iter(altloc_lines))

        for line2flush in altloc_lines[altloc]:
            yield line2flush[:16] + ' ' + line2flush[17:]

    # clears the altloc group dictionary. Ready for the next one!
    altloc_lines.clear()
    res_per_loc.clear()
309+
310+
311+
def flush_resloc_id_same_residue(selloc, altloc_lines, res_per_loc):
    """
    Flush altloc if altloc are atoms in the same residue - by ID.

    The captured lines are grouped per atom (chain, residue number plus
    insertion code, atom name), so the alternate locations of one atom
    are evaluated together. For each atom:

    * if it has lines with altloc `selloc`, only those are yielded,
      with the altloc column blanked;
    * otherwise all of its lines are yielded untouched.

    Atoms are yielded by ascending serial number. Both dictionaries are
    emptied once the generator is exhausted.
    """
    # organize by atoms; alternates of one atom carry different serial
    # numbers, therefore the serial must not be part of the key
    atoms = {}
    for lines in altloc_lines.values():
        for line in lines:
            atom_uid = (line[21], line[22:27], line[12:16])
            atoms.setdefault(atom_uid, []).append(line)

    def serial(line):
        return int(line[6:11])

    # restores the file order: lines inside each atom and atoms themselves
    # (sorting is stable, an ANISOU stays right after its ATOM/HETATM)
    per_atom = [sorted(lines, key=serial) for lines in atoms.values()]
    per_atom.sort(key=lambda lines: serial(lines[0]))

    for lines in per_atom:
        # the selection is evaluated anew for every atom
        selected = [line for line in lines if line[16] == selloc]

        if selected:
            for line in selected:
                yield line[:16] + ' ' + line[17:]
        else:
            for line in lines:
                yield line

    altloc_lines.clear()
    res_per_loc.clear()
185343

186344

187-
def select_by_altloc(fhandle, selloc):
188-
"""
189-
Pick one altloc when atoms have more than one.
345+
def flush_resloc_occ_same_residue(altloc_lines, res_per_loc, **kw):
    """
    Flush altloc if altloc are atoms in the same residue - by occ.

    The captured lines are grouped per atom (chain, residue number plus
    insertion code, atom name), so the alternate locations of one atom
    are evaluated together. For each atom, the ATOM/HETATM line with
    the highest occupancy (columns 55-60) is yielded with the altloc
    column blanked, followed by the ANISOU lines carrying the same
    altloc identifier. On ties, the line with the lowest serial number
    wins.

    Atoms are yielded by ascending serial number. Extra keyword
    arguments are accepted and ignored so that all flush functions can
    be called alike. Both dictionaries are emptied once the generator
    is exhausted.
    """
    coord_records = ('ATOM', 'HETATM')

    # organize by atoms; alternates of one atom carry different serial
    # numbers, therefore the serial must not be part of the key
    atoms = {}
    for lines in altloc_lines.values():
        for line in lines:
            atom_uid = (line[21], line[22:27], line[12:16])
            atoms.setdefault(atom_uid, []).append(line)

    def serial(line):
        return int(line[6:11])

    def occupancy(line):
        return float(line[54:60])

    # restores the file order: lines inside each atom and atoms themselves
    # (sorting is stable, an ANISOU stays right after its ATOM/HETATM)
    per_atom = [sorted(lines, key=serial) for lines in atoms.values()]
    per_atom.sort(key=lambda lines: serial(lines[0]))

    for lines in per_atom:
        # ANISOU records have no occupancy field, only coordinate
        # records take part in the ranking
        coords = [line for line in lines if line.startswith(coord_records)]

        if not coords:
            # nothing to rank: pass the lines through instead of
            # dropping them
            for line in lines:
                yield line
            continue

        # `max` returns the first of equally ranked items
        best = max(coords, key=occupancy)
        yield best[:16] + ' ' + best[17:]

        for line in lines:
            if line.startswith('ANISOU') and line[16] == best[16]:
                yield line[:16] + ' ' + line[17:]

    altloc_lines.clear()
    res_per_loc.clear()
218376

219-
atom_data_append(line)
220377

221-
if line.startswith(records):
222-
# Sometimes altlocs are used between different residue names.
223-
# See 3u7t (residue 22 of chain A). So we ignore the resname below.
224-
atom_uid = (line[12:16], line[20:26])
378+
def all_same_residue(altloc_lines):
    """
    Tell whether all captured lines come from one single residue.

    A residue is identified by its name (columns 18-20) and its number
    (columns 23-26, stripped). With no captured line at all the answer
    is False.
    """
    seen = {
        (record[17:20], record[22:26].strip())
        for group in altloc_lines.values()
        for record in group
        }
    return len(seen) == 1
388+
389+
390+
def partial_altloc(altloc_lines):
    """
    Detect if the altloc positions are atoms in a single residue.

    True when the captured group contains lines without altloc
    identifier (blank key) and all of its lines belong to one residue.
    """
    if ' ' not in altloc_lines:
        return False
    return all_same_residue(altloc_lines)
255393

256394

257395
def run(fhandle, option=None):

0 commit comments

Comments
 (0)