3939data to another. They are based on old FORTRAN77 code that was taking too much
4040effort to maintain and compile. RIP.
4141"""
42-
4342import operator
4443import os
4544import sys
@@ -113,8 +112,21 @@ def check_input(args):
113112
114113
def select_by_occupancy(fhandle):
    """
    Select alternative locations by occupancy.

    Thin wrapper that delegates to `select_altloc` with ``selloc=None``
    and ``byocc=True``.

    Parameters
    ----------
    fhandle : an iterable giving the PDB file line-by-line.

    Returns
    -------
    The generator returned by `select_altloc`.
    """
    return select_altloc(fhandle, selloc=None, byocc=True)
116+
117+
def select_by_altloc(fhandle, selloc):
    """
    Select alternative locations by identifier.

    Thin wrapper that delegates to `select_altloc` with the given
    `selloc` and ``byocc=False``.

    Parameters
    ----------
    fhandle : an iterable giving the PDB file line-by-line.

    selloc : str
        The altloc identifier to keep.

    Returns
    -------
    The generator returned by `select_altloc`.
    """
    return select_altloc(fhandle, selloc, byocc=False)
120+
121+
def select_altloc(fhandle, selloc=None, byocc=False):
    """
    Pick one altloc when atoms have more than one.

    If the specified altloc (selloc) is not present for this particular
    atom, outputs all altlocs. For instance, if atom X has altlocs A and
    B but the user picked C, we return A and B anyway. If atom Y has
    altlocs A, B, and C, then we only return C.

    This function is a generator.

    Parameters
    ----------
    fhandle : an iterable giving the PDB file line-by-line.

    selloc : str or None
        The altloc identifier to keep. May be None only if `byocc` is
        True.

    byocc : bool
        If True, the occupancy-based flush functions are used instead
        of the identifier-based ones.

    Yields
    ------
    str (line-by-line)
        The PDB file with altlocs according to selection.

    Raises
    ------
    ValueError
        If `selloc` is None and `byocc` is False.
    """
    if selloc is None and not byocc:
        raise ValueError('Provide either `selloc` or `byocc`.')

    altloc_lines = {}  # lines of the current altloc group, per identifier
    res_per_loc = {}  # residues seen in the current group, per identifier

    prev_altloc = ''
    prev_resname = ''
    prev_resnum = ''

    flush_func_multi_residues = flush_resloc_occ if byocc else flush_resloc
    flush_func_single_residues = (
        flush_resloc_occ_same_residue
        if byocc
        else flush_resloc_id_same_residue
        )

    def _flush_group():
        """Return a generator flushing the currently buffered group."""
        # the flush function is chosen when the flush is requested,
        # i.e. before any of the buffered lines is consumed.
        if partial_altloc(altloc_lines):
            flush_func = flush_func_single_residues
        else:
            flush_func = flush_func_multi_residues

        return flush_func(
            selloc=selloc,
            altloc_lines=altloc_lines,
            res_per_loc=res_per_loc,
            )

    records = ('ATOM', 'HETATM', 'ANISOU')
    # 'END' also matches 'ENDMDL'; the latter is kept for readability.
    terminators = ('TER', 'END', 'ENDMDL', 'CONECT')

    for line in fhandle:

        if line.startswith(records):
            # captures the relevant parameters
            altloc = line[16]
            resname = line[17:20]
            resnum = line[22:26].strip()

            if is_another_altloc_group(
                    altloc, prev_altloc, resnum, prev_resnum,
                    resname, prev_resname, altloc_lines, res_per_loc):
                # the altloc group has changed: flush the lines observed
                # for the previous group before buffering the new line.
                # Uses a for loop instead of "yield from" to maintain
                # compatibility with older python versions.
                for flushed_line in _flush_group():
                    yield flushed_line

            # saves the line per altloc identifier
            altloc_lines.setdefault(altloc, []).append(line)

            # registers which residues are seen for each identifier
            res_per_loc.setdefault(altloc, set()).add((resname, resnum))

            prev_altloc = altloc
            prev_resnum = resnum
            prev_resname = resname

        elif line.startswith(terminators):
            # the buffered altloc group must be flushed before the
            # terminator line itself is emitted.
            if altloc_lines:
                for flushed_line in _flush_group():
                    yield flushed_line

            prev_altloc = ''
            prev_resname = ''
            prev_resnum = ''

            yield line  # the terminator line

        else:
            # NOTE(review): any other line is emitted immediately while a
            # buffered group (if any) stays buffered; confirm this
            # ordering is intended for lines placed between coordinates.
            prev_altloc = ''
            prev_resname = ''
            prev_resnum = ''
            yield line

    # end of for loop
    # flush altloc residues in case the last residue was an altloc
    if altloc_lines:
        for flushed_line in _flush_group():
            yield flushed_line
230+
def is_another_altloc_group(
        altloc,
        prev_altloc,
        resnum,
        prev_resnum,
        resname,
        prev_resname,
        altloc_lines,
        rploc,
        ):
    """
    Detect if the current line belongs to another altloc group.

    Parameters
    ----------
    altloc, prev_altloc : str
        Altloc identifier of the current and of the previous line.

    resnum, prev_resnum : str
        Residue number of the current and of the previous line.
        Compared numerically when both convert to int.

    resname, prev_resname : str
        Residue name of the current and of the previous line.

    altloc_lines : dict
        Buffered lines of the current group, keyed by altloc identifier.

    rploc : dict
        Residues seen in the current group, keyed by altloc identifier.

    Returns
    -------
    bool
        True if the current line starts a new group. Always False while
        any of the `prev_*` values is empty.
    """
    # without a complete previous state there is no group to leave
    if not all((prev_altloc, prev_resname, prev_resnum)):
        return False

    # an altloc block is followed by a regular atom of a later residue
    if (
            prev_altloc != altloc
            and altloc == ' '
            and _resnum_after(resnum, prev_resnum)
            ):
        return True

    # two regular atoms that belong to different residues
    if (
            prev_altloc == ' '
            and altloc == ' '
            and (resnum != prev_resnum or resname != prev_resname)
            ):
        return True

    # the same altloc identifier shows up again for a later residue
    # while every identifier of the group has seen the same number of
    # residues, i.e. the buffered group is complete.
    residues_seen = list(rploc.values())
    return (
        prev_altloc == altloc
        and prev_altloc != ' '
        and altloc in altloc_lines
        and _resnum_after(resnum, prev_resnum)
        and len(altloc_lines) > 1
        and all(
            len(seen) == len(residues_seen[0])
            for seen in residues_seen[1:]
            )
        )


def _resnum_after(current, previous):
    """Return True if residue number `current` is greater than `previous`."""
    # residue numbers arrive as stripped strings; a plain string
    # comparison would order '100' before '99'.
    try:
        return int(current) > int(previous)
    except (TypeError, ValueError):
        # non-numeric identifiers: fall back to the plain comparison
        return current > previous
267+
268+
def flush_resloc(selloc, altloc_lines, res_per_loc):
    """
    Flush the captured altloc lines, selecting by identifier.

    This function is a generator.

    Parameters
    ----------
    selloc : str
        The altloc identifier to keep.

    altloc_lines : dict
        Buffered lines keyed by altloc identifier. Emptied once the
        generator is exhausted.

    res_per_loc : dict
        Residues seen per altloc identifier. Emptied once the generator
        is exhausted.

    Yields
    ------
    str
        The lines of `selloc` with the altloc column blanked or, if the
        group does not contain `selloc`, every buffered line unchanged.
    """
    if selloc in altloc_lines:
        # only the selected altloc is yielded
        for line in altloc_lines[selloc]:
            yield line[:16] + ' ' + line[17:]
    else:
        # the altloc group does not contain the selected altloc,
        # therefore all members are yielded
        for lines in altloc_lines.values():
            for line in lines:
                yield line

    # clears the altloc group dictionaries. Ready for the next one!
    altloc_lines.clear()
    res_per_loc.clear()
286+
287+
def flush_resloc_occ(altloc_lines, res_per_loc, **kw):
    """
    Flush the captured altloc lines by highest occupancy.

    This function is a generator.

    Parameters
    ----------
    altloc_lines : dict
        Buffered lines keyed by altloc identifier. Emptied once the
        generator is exhausted.

    res_per_loc : dict
        Residues seen per altloc identifier. Emptied once the generator
        is exhausted.

    **kw
        Ignored; accepted so that all flush functions can be called
        with the same keyword arguments.

    Yields
    ------
    str
        The lines of the identifier with the highest occupancy, with
        the altloc column blanked. On ties the first identifier wins.
    """
    best_altloc = None
    best_occ = None

    # detects which altloc identifier has the highest occupancy
    for altloc, lines in altloc_lines.items():
        # we check only the first line because all atoms in one
        # identifier should have the same occupancy value
        occ = float(lines[0][54:60])
        # no fixed 0.00 seed: a group whose occupancies are all zero
        # must still resolve to one of its own identifiers.
        if best_occ is None or occ > best_occ:
            best_altloc = altloc
            best_occ = occ

    if best_altloc is not None:
        for line in altloc_lines[best_altloc]:
            yield line[:16] + ' ' + line[17:]

    # clears the altloc group dictionaries. Ready for the next one!
    altloc_lines.clear()
    res_per_loc.clear()
309+
310+
def flush_resloc_id_same_residue(selloc, altloc_lines, res_per_loc):
    """
    Flush altloc if altloc are atoms in the same residue - by ID.

    The selection is made atom by atom: when an atom has a record with
    identifier `selloc`, only that record is kept (altloc column
    blanked); otherwise every record of that atom is kept unchanged.

    This function is a generator.

    Parameters
    ----------
    selloc : str
        The altloc identifier to keep.

    altloc_lines : dict
        Buffered lines keyed by altloc identifier. Emptied once the
        generator is exhausted.

    res_per_loc : dict
        Residues seen per altloc identifier. Emptied once the generator
        is exhausted.

    Yields
    ------
    str
        The selected lines, atoms ordered by their lowest serial number.
    """
    # organize by atom name: serial numbers are unique per record, so
    # they cannot be part of the key or alternates would never meet.
    atoms = {}
    for lines in altloc_lines.values():
        for line in lines:
            atoms.setdefault(line[12:16], []).append(line)

    # restore file order inside each atom; the sort is stable, so an
    # ANISOU record stays right after the ATOM sharing its serial.
    for records in atoms.values():
        records.sort(key=_serial_of)

    ordered_atoms = sorted(
        atoms.values(),
        key=lambda records: _serial_of(records[0]),
        )

    for records in ordered_atoms:
        selected = [line for line in records if line[16] == selloc]
        if selected:
            for line in selected:
                yield line[:16] + ' ' + line[17:]
        else:
            # this atom does not have the requested altloc:
            # all its records are yielded untouched.
            for line in records:
                yield line

    altloc_lines.clear()
    res_per_loc.clear()


def _serial_of(line):
    """Return the atom serial number (columns 7-11) of a PDB record."""
    return int(line[6:11])
185343
186344
187- def select_by_altloc (fhandle , selloc ):
188- """
189- Pick one altloc when atoms have more than one.
def flush_resloc_occ_same_residue(altloc_lines, res_per_loc, **kw):
    """
    Flush altloc if altloc are atoms in the same residue - by occ.

    For every atom name, the ATOM/HETATM record with the highest
    occupancy is kept (first one on ties), followed by the ANISOU
    record that carries the same serial number, if any. The altloc
    column of the kept records is blanked.

    This function is a generator.

    Parameters
    ----------
    altloc_lines : dict
        Buffered lines keyed by altloc identifier. Emptied once the
        generator is exhausted.

    res_per_loc : dict
        Residues seen per altloc identifier. Emptied once the generator
        is exhausted.

    **kw
        Ignored; accepted so that all flush functions can be called
        with the same keyword arguments.

    Yields
    ------
    str
        The selected lines, atoms ordered by their lowest serial number.
    """
    # organize by atom name: serial numbers are unique per record, so
    # they cannot be part of the key or alternates would never meet.
    atoms = {}
    # ANISOU records have no occupancy column; they are attached to the
    # coordinate record through the shared serial number instead.
    anisou = {}
    for lines in altloc_lines.values():
        for line in lines:
            if line.startswith('ANISOU'):
                anisou[line[6:11]] = line
            else:
                atoms.setdefault(line[12:16], []).append(line)

    # restore file order inside each atom so ties are resolved by the
    # record that comes first in the file.
    for records in atoms.values():
        records.sort(key=_atom_serial)

    ordered_atoms = sorted(
        atoms.values(),
        key=lambda records: _atom_serial(records[0]),
        )

    for records in ordered_atoms:
        best = max(records, key=lambda line: float(line[54:60]))
        yield best[:16] + ' ' + best[17:]

        partner = anisou.get(best[6:11])
        if partner is not None:
            yield partner[:16] + ' ' + partner[17:]

    altloc_lines.clear()
    res_per_loc.clear()


def _atom_serial(line):
    """Return the atom serial number (columns 7-11) of a PDB record."""
    return int(line[6:11])
218376
219- atom_data_append (line )
220377
221- if line .startswith (records ):
222- # Sometimes altlocs are used between different residue names.
223- # See 3u7t (residue 22 of chain A). So we ignore the resname below.
224- atom_uid = (line [12 :16 ], line [20 :26 ])
def all_same_residue(altloc_lines):
    """
    Check whether all buffered lines come from one single residue.

    Parameters
    ----------
    altloc_lines : dict
        Buffered lines keyed by altloc identifier.

    Returns
    -------
    bool
        True if exactly one (residue name, residue number) pair is
        found among all lines; False otherwise, including when there
        are no lines at all.
    """
    residues = {
        (line[17:20], line[22:26].strip())
        for lines in altloc_lines.values()
        for line in lines
        }

    return len(residues) == 1
388+
389+
def partial_altloc(altloc_lines):
    """
    Detect if the altloc positions are atoms in a single residue.

    Parameters
    ----------
    altloc_lines : dict
        Buffered lines keyed by altloc identifier.

    Returns
    -------
    bool
        True if the group holds lines with a blank altloc identifier
        and all its lines belong to the same residue.
    """
    # the blank identifier is tested first, so the residue scan is
    # skipped for groups made of labelled alternates only.
    return ' ' in altloc_lines and all_same_residue(altloc_lines)
255393
256394
257395def run (fhandle , option = None ):
0 commit comments