Skip to content
This repository was archived by the owner on Jan 9, 2025. It is now read-only.

Commit 13c0200

Browse files
authored
Limit connectivity to fragment edges only (computational-metabolomics#13)
* Return metabolite ID with msn annotation results * Rewrite MSn method for limiting connectivity to fragment edges only * Add option for use of smiles without non-structural isomeric information * Update tests
1 parent 44bf05c commit 13c0200

File tree

5 files changed

+139
-94
lines changed

5 files changed

+139
-94
lines changed

metaboblend/build_structures.py

Lines changed: 69 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,10 @@ def reindex_atoms(records):
185185

186186
if atom.GetIdx() in record["degree_atoms"]:
187187
atoms_available.append(new_idx)
188+
188189
if atom.GetIdx() in record["dummies"]:
189190
atoms_to_remove.append(new_idx)
191+
190192
if atom.GetIdx() in record["bond_types"]:
191193
bond_types[new_idx] = record["bond_types"][atom.GetIdx()]
192194
all_bond_types[i] += record["bond_types"][atom.GetIdx()]
@@ -289,7 +291,8 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
289291
write_fragment_smis: bool = False,
290292
minimum_frequency: Union[int, None] = None,
291293
hydrogenation_allowance: int = 2,
292-
yield_smi_dict: bool = True) -> Union[Sequence[Dict[str, int]], None]:
294+
yield_smi_dict: bool = True,
295+
isomeric_smiles: bool = False) -> Union[Sequence[Dict[str, int]], None]:
293296
"""
294297
Generate molecules of a given mass using chemical substructures, connectivity graphs and spectral trees or
295298
fragmentation spectra. Final structures and rankings are yielded by the function as a dictionary and/or written in
@@ -355,6 +358,8 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
355358
:param yield_smi_dict: If True, for each input molecule the function yields a dictionary whose keys are SMILEs
356359
strings and values are the number of `fragment_masses` by which the structure was generated. Else, returns None.
357360
361+
:param isomeric_smiles: If True, writes smiles with non-structural isomeric information.
362+
358363
:return: For each input molecule yields a dictionary whose keys are SMILEs strings for the generated
359364
structures and values are the number of `fragment_masses` by which the structure was built (unless
360365
`yield_smi_dict = False`).
@@ -413,7 +418,8 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
413418
out_mode="a",
414419
table_name=table_name,
415420
ncpus=ncpus,
416-
clean=False
421+
clean=False,
422+
isomeric_smiles=isomeric_smiles
417423
))
418424

419425
for smi in fragment_smis:
@@ -426,7 +432,7 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
426432
[k + "," + str(i) + "\n" for k, i in zip(structure_frequency.keys(), structure_frequency.values())])
427433

428434
if yield_smi_dict:
429-
yield structure_frequency
435+
yield {ms_id: structure_frequency}
430436

431437
db.close()
432438

@@ -442,7 +448,8 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
442448
ncpus: Union[int, None] = None,
443449
path_connectivity_db: Union[str, bytes, os.PathLike, None] = None,
444450
minimum_frequency: Union[int, None] = None,
445-
yield_smi_set: bool = True
451+
yield_smi_set: bool = True,
452+
isomeric_smiles: bool = False
446453
) -> Union[Sequence[list], None]:
447454
"""
448455
Generate molecules of a given mass using chemical substructures and connectivity graphs. Can optionally take a
@@ -499,6 +506,8 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
499506
500507
:param yield_smi_set: If True, yields a set of unique SMILEs string for each input molecule, else returns None.
501508
509+
:param isomeric_smiles: If True, writes smiles with non-structural isomeric information.
510+
502511
:return: For each input molecule, yields a set of unique SMILEs strings (unless
503512
`yield_smi_set = False`).
504513
"""
@@ -545,7 +554,8 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
545554
out_mode="w",
546555
table_name=table_name,
547556
ncpus=ncpus,
548-
clean=False
557+
clean=False,
558+
isomeric_smiles=isomeric_smiles
549559
)
550560

551561
if yield_smi_set:
@@ -555,7 +565,7 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
555565

556566

557567
def build(mf, exact_mass, max_n_substructures, path_smi_out, path_connectivity_db, path_substructure_db,
558-
prescribed_mass, ppm, out_mode, ncpus, table_name, clean):
568+
prescribed_mass, ppm, out_mode, ncpus, table_name, clean, isomeric_smiles):
559569
"""
560570
Core function for generating molecules of a given mass using substructures and connectivity graphs. Can optionally
561571
take a "prescribed" fragment mass to further filter results; this can be used to incorporate MSn data. Final
@@ -600,23 +610,34 @@ def build(mf, exact_mass, max_n_substructures, path_smi_out, path_connectivity_d
600610
601611
:param clean: Whether to remove the temporary table of substructures, table_name`, after the method is complete.
602612
613+
:param isomeric_smiles: If True, writes smiles with non-structural isomeric information.
614+
603615
:return: Returns a set of unique SMILEs strings.
604616
"""
605617

606618
db = SubstructureDb(path_substructure_db, path_connectivity_db)
607619
tolerance = 0.001
608620

609-
if prescribed_mass is None: # standard build method
621+
if prescribed_mass is None:
610622
exact_mass__1 = round(exact_mass)
611-
exact_mass__0_0001 = round(exact_mass, 4)
612623

613624
else: # prescribed substructure build method
625+
if ((prescribed_mass / 1000000) * ppm) > 0.001:
626+
fragment_tolerance = round((prescribed_mass / 1000000) * ppm, 4)
627+
else:
628+
fragment_tolerance = 0.001
629+
630+
prescribed_subset = db.select_mass_values("0_0001", [round(prescribed_mass)], table_name)
631+
prescribed_subset = [m for m in prescribed_subset[0] if abs(m - prescribed_mass) <= fragment_tolerance]
632+
633+
if len(prescribed_subset) == 0:
634+
return set()
635+
614636
loss = exact_mass - prescribed_mass
615637
exact_mass__1 = round(loss)
616-
exact_mass__0_0001 = round(loss, 4)
617638

618-
if ((loss / 1000000) * ppm) > 0.001:
619-
tolerance = round((loss / 1000000) * ppm, 4)
639+
if ((exact_mass / 1000000) * ppm) > 0.001:
640+
tolerance = round((exact_mass / 1000000) * ppm, 4)
620641

621642
max_n_substructures -= 1 # we find sets of mols that add up to the loss, not the precursor mass
622643

@@ -630,7 +651,7 @@ def build(mf, exact_mass, max_n_substructures, path_smi_out, path_connectivity_d
630651

631652
integer_subsets = list(subset_sum(integer_mass_values, exact_mass__1, max_n_substructures))
632653

633-
configs_iso = db.k_configs(prescribed_mass is not None)
654+
configs_iso = db.k_configs()
634655
if path_smi_out is not None:
635656
smi_out = open(path_smi_out, out_mode)
636657

@@ -641,25 +662,29 @@ def build(mf, exact_mass, max_n_substructures, path_smi_out, path_connectivity_d
641662

642663
# refine groups of masses to 4dp mass resolution
643664
exact_mass_values = db.select_mass_values("0_0001", integer_subset, table_name)
644-
exact_subsets = []
665+
666+
if prescribed_mass is not None:
667+
exact_mass_values = [prescribed_subset] + exact_mass_values
645668

646669
# use combinations to get second group of masses instead of subset sum - subset sum is integer mass only
670+
exact_subsets = []
647671
for mass_combo in itertools.product(*exact_mass_values):
648-
if abs(sum(mass_combo) - exact_mass__0_0001) <= tolerance:
672+
if abs(sum(mass_combo) - exact_mass) <= tolerance:
649673
exact_subsets.append(mass_combo)
650674

651675
if len(exact_subsets) == 0:
652676
continue
653677

654-
if prescribed_mass is not None: # add fragments mass to to loss group
655-
exact_subsets = [subset + (round(exact_mass - loss, 4),) for subset in exact_subsets]
656-
657678
# refines groups based on ecs and gets substructures from db (appends to substructure_subsets)
658679
for exact_subset in exact_subsets:
659680
substructure_subsets += build_from_subsets(exact_subset, mf=mf, table_name=table_name, db=db)
660681

661682
with multiprocessing.Pool(processes=ncpus) as pool: # send sets of substructures for building
662-
smi_lists = pool.map(partial(substructure_combination_build, configs_iso=configs_iso), substructure_subsets)
683+
smi_lists = pool.map(
684+
partial(substructure_combination_build, configs_iso=configs_iso,
685+
prescribed_structure=prescribed_mass, isomeric_smiles=isomeric_smiles),
686+
substructure_subsets
687+
)
663688

664689
smis = set([val for sublist in smi_lists for val in sublist])
665690

@@ -795,7 +820,7 @@ def build_from_subsets(exact_subset, mf, table_name, db):
795820
return substructure_subsets
796821

797822

798-
def substructure_combination_build(substructure_subset, configs_iso):
823+
def substructure_combination_build(substructure_subset, configs_iso, prescribed_structure, isomeric_smiles):
799824
"""
800825
Final stage for building molecules; takes a combination of substructures (substructure_combination) and builds them
801826
according to graphs in the substructure database. May be run in parallel.
@@ -811,12 +836,25 @@ def substructure_combination_build(substructure_subset, configs_iso):
811836
smis = []
812837

813838
for substructure_combination in itertools.product(*substructure_subset):
839+
substructure_combination[0]["fragment"] = True
814840
substructure_combination = sorted(substructure_combination, key=itemgetter('atoms_available', 'valence'))
815841

816842
v_a = ()
817-
for d in substructure_combination:
843+
if prescribed_structure is not None:
844+
fragment_indexes = []
845+
j = -1
846+
for i, d in enumerate(substructure_combination):
818847
v_a += (tuple(d["degree_atoms"].values()),) # obtain valence configuration of the set of substructures
819848

849+
for atom_available in tuple(d["degree_atoms"].values()):
850+
j += 1
851+
852+
try:
853+
if prescribed_structure is not None and d["fragment"]:
854+
fragment_indexes.append(j)
855+
except KeyError:
856+
continue
857+
820858
if str(v_a) not in configs_iso: # check mols "fit" together according to the connectivity database
821859
continue
822860

@@ -826,6 +864,16 @@ def substructure_combination_build(substructure_subset, configs_iso):
826864
continue # check that bond types are compatible (imperfect check)
827865

828866
for edges in configs_iso[str(v_a)]: # build mols for each graph in connectivity db
867+
if prescribed_structure is not None:
868+
non_fragment_edges = False
869+
870+
for edge in edges: # check that edges only connect to fragment ion
871+
if edge[0] not in fragment_indexes and edge[1] not in fragment_indexes:
872+
non_fragment_edges = True
873+
874+
if non_fragment_edges:
875+
continue
876+
829877
mol_e = add_bonds(mol_comb, edges, atoms_available, bond_types) # add bonds between substructures
830878

831879
if mol_e is None:
@@ -842,7 +890,7 @@ def substructure_combination_build(substructure_subset, configs_iso):
842890
continue
843891

844892
try: # append the canonical smiles of the final structure
845-
smis.append(Chem.MolToSmiles(mol_out))
893+
smis.append(Chem.MolToSmiles(mol_out, isomericSmiles=isomeric_smiles))
846894
except RuntimeError:
847895
continue # bad bond type violation
848896

0 commit comments

Comments
 (0)