@@ -185,8 +185,10 @@ def reindex_atoms(records):
185
185
186
186
if atom .GetIdx () in record ["degree_atoms" ]:
187
187
atoms_available .append (new_idx )
188
+
188
189
if atom .GetIdx () in record ["dummies" ]:
189
190
atoms_to_remove .append (new_idx )
191
+
190
192
if atom .GetIdx () in record ["bond_types" ]:
191
193
bond_types [new_idx ] = record ["bond_types" ][atom .GetIdx ()]
192
194
all_bond_types [i ] += record ["bond_types" ][atom .GetIdx ()]
@@ -289,7 +291,8 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
289
291
write_fragment_smis : bool = False ,
290
292
minimum_frequency : Union [int , None ] = None ,
291
293
hydrogenation_allowance : int = 2 ,
292
- yield_smi_dict : bool = True ) -> Union [Sequence [Dict [str , int ]], None ]:
294
+ yield_smi_dict : bool = True ,
295
+ isomeric_smiles : bool = False ) -> Union [Sequence [Dict [str , int ]], None ]:
293
296
"""
294
297
Generate molecules of a given mass using chemical substructures, connectivity graphs and spectral trees or
295
298
fragmentation spectra. Final structures and rankings are yielded by the function as a dictionary and/or written in
@@ -355,6 +358,8 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
355
358
:param yield_smi_dict: If True, for each input molecule the function yields a dictionary whose keys are SMILEs
356
359
strings and values are the number of `fragment_masses` by which the structure was generated. Else, returns None.
357
360
361
+ :param isomeric_smiles: If True, writes smiles with non-structural isomeric information.
362
+
358
363
:return: For each input molecule yields a dictionary whose keys are SMILEs strings for the generated
359
364
structures and values are the number of `fragment_masses` by which the structure was built (unless
360
365
`yield_smi_dict = False`).
@@ -413,7 +418,8 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
413
418
out_mode = "a" ,
414
419
table_name = table_name ,
415
420
ncpus = ncpus ,
416
- clean = False
421
+ clean = False ,
422
+ isomeric_smiles = isomeric_smiles
417
423
))
418
424
419
425
for smi in fragment_smis :
@@ -426,7 +432,7 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
426
432
[k + "," + str (i ) + "\n " for k , i in zip (structure_frequency .keys (), structure_frequency .values ())])
427
433
428
434
if yield_smi_dict :
429
- yield structure_frequency
435
+ yield { ms_id : structure_frequency }
430
436
431
437
db .close ()
432
438
@@ -442,7 +448,8 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
442
448
ncpus : Union [int , None ] = None ,
443
449
path_connectivity_db : Union [str , bytes , os .PathLike , None ] = None ,
444
450
minimum_frequency : Union [int , None ] = None ,
445
- yield_smi_set : bool = True
451
+ yield_smi_set : bool = True ,
452
+ isomeric_smiles : bool = False
446
453
) -> Union [Sequence [list ], None ]:
447
454
"""
448
455
Generate molecules of a given mass using chemical substructures and connectivity graphs. Can optionally take a
@@ -499,6 +506,8 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
499
506
500
507
:param yield_smi_set: If True, yields a set of unique SMILEs string for each input molecule, else returns None.
501
508
509
+ :param isomeric_smiles: If True, writes smiles with non-structural isomeric information.
510
+
502
511
:return: For each input molecule, yields a set of unique SMILEs strings (unless
503
512
`yield_smi_set = False`).
504
513
"""
@@ -545,7 +554,8 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
545
554
out_mode = "w" ,
546
555
table_name = table_name ,
547
556
ncpus = ncpus ,
548
- clean = False
557
+ clean = False ,
558
+ isomeric_smiles = isomeric_smiles
549
559
)
550
560
551
561
if yield_smi_set :
@@ -555,7 +565,7 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
555
565
556
566
557
567
def build (mf , exact_mass , max_n_substructures , path_smi_out , path_connectivity_db , path_substructure_db ,
558
- prescribed_mass , ppm , out_mode , ncpus , table_name , clean ):
568
+ prescribed_mass , ppm , out_mode , ncpus , table_name , clean , isomeric_smiles ):
559
569
"""
560
570
Core function for generating molecules of a given mass using substructures and connectivity graphs. Can optionally
561
571
take a "prescribed" fragment mass to further filter results; this can be used to incorporate MSn data. Final
@@ -600,23 +610,34 @@ def build(mf, exact_mass, max_n_substructures, path_smi_out, path_connectivity_d
600
610
601
611
:param clean: Whether to remove the temporary table of substructures, table_name`, after the method is complete.
602
612
613
+ :param isomeric_smiles: If True, writes smiles with non-structural isomeric information.
614
+
603
615
:return: Returns a set of unique SMILEs strings.
604
616
"""
605
617
606
618
db = SubstructureDb (path_substructure_db , path_connectivity_db )
607
619
tolerance = 0.001
608
620
609
- if prescribed_mass is None : # standard build method
621
+ if prescribed_mass is None :
610
622
exact_mass__1 = round (exact_mass )
611
- exact_mass__0_0001 = round (exact_mass , 4 )
612
623
613
624
else : # prescribed substructure build method
625
+ if ((prescribed_mass / 1000000 ) * ppm ) > 0.001 :
626
+ fragment_tolerance = round ((prescribed_mass / 1000000 ) * ppm , 4 )
627
+ else :
628
+ fragment_tolerance = 0.001
629
+
630
+ prescribed_subset = db .select_mass_values ("0_0001" , [round (prescribed_mass )], table_name )
631
+ prescribed_subset = [m for m in prescribed_subset [0 ] if abs (m - prescribed_mass ) <= fragment_tolerance ]
632
+
633
+ if len (prescribed_subset ) == 0 :
634
+ return set ()
635
+
614
636
loss = exact_mass - prescribed_mass
615
637
exact_mass__1 = round (loss )
616
- exact_mass__0_0001 = round (loss , 4 )
617
638
618
- if ((loss / 1000000 ) * ppm ) > 0.001 :
619
- tolerance = round ((loss / 1000000 ) * ppm , 4 )
639
+ if ((exact_mass / 1000000 ) * ppm ) > 0.001 :
640
+ tolerance = round ((exact_mass / 1000000 ) * ppm , 4 )
620
641
621
642
max_n_substructures -= 1 # we find sets of mols that add up to the loss, not the precursor mass
622
643
@@ -630,7 +651,7 @@ def build(mf, exact_mass, max_n_substructures, path_smi_out, path_connectivity_d
630
651
631
652
integer_subsets = list (subset_sum (integer_mass_values , exact_mass__1 , max_n_substructures ))
632
653
633
- configs_iso = db .k_configs (prescribed_mass is not None )
654
+ configs_iso = db .k_configs ()
634
655
if path_smi_out is not None :
635
656
smi_out = open (path_smi_out , out_mode )
636
657
@@ -641,25 +662,29 @@ def build(mf, exact_mass, max_n_substructures, path_smi_out, path_connectivity_d
641
662
642
663
# refine groups of masses to 4dp mass resolution
643
664
exact_mass_values = db .select_mass_values ("0_0001" , integer_subset , table_name )
644
- exact_subsets = []
665
+
666
+ if prescribed_mass is not None :
667
+ exact_mass_values = [prescribed_subset ] + exact_mass_values
645
668
646
669
# use combinations to get second group of masses instead of subset sum - subset sum is integer mass only
670
+ exact_subsets = []
647
671
for mass_combo in itertools .product (* exact_mass_values ):
648
- if abs (sum (mass_combo ) - exact_mass__0_0001 ) <= tolerance :
672
+ if abs (sum (mass_combo ) - exact_mass ) <= tolerance :
649
673
exact_subsets .append (mass_combo )
650
674
651
675
if len (exact_subsets ) == 0 :
652
676
continue
653
677
654
- if prescribed_mass is not None : # add fragments mass to to loss group
655
- exact_subsets = [subset + (round (exact_mass - loss , 4 ),) for subset in exact_subsets ]
656
-
657
678
# refines groups based on ecs and gets substructures from db (appends to substructure_subsets)
658
679
for exact_subset in exact_subsets :
659
680
substructure_subsets += build_from_subsets (exact_subset , mf = mf , table_name = table_name , db = db )
660
681
661
682
with multiprocessing .Pool (processes = ncpus ) as pool : # send sets of substructures for building
662
- smi_lists = pool .map (partial (substructure_combination_build , configs_iso = configs_iso ), substructure_subsets )
683
+ smi_lists = pool .map (
684
+ partial (substructure_combination_build , configs_iso = configs_iso ,
685
+ prescribed_structure = prescribed_mass , isomeric_smiles = isomeric_smiles ),
686
+ substructure_subsets
687
+ )
663
688
664
689
smis = set ([val for sublist in smi_lists for val in sublist ])
665
690
@@ -795,7 +820,7 @@ def build_from_subsets(exact_subset, mf, table_name, db):
795
820
return substructure_subsets
796
821
797
822
798
- def substructure_combination_build (substructure_subset , configs_iso ):
823
+ def substructure_combination_build (substructure_subset , configs_iso , prescribed_structure , isomeric_smiles ):
799
824
"""
800
825
Final stage for building molecules; takes a combination of substructures (substructure_combination) and builds them
801
826
according to graphs in the substructure database. May be run in parallel.
@@ -811,12 +836,25 @@ def substructure_combination_build(substructure_subset, configs_iso):
811
836
smis = []
812
837
813
838
for substructure_combination in itertools .product (* substructure_subset ):
839
+ substructure_combination [0 ]["fragment" ] = True
814
840
substructure_combination = sorted (substructure_combination , key = itemgetter ('atoms_available' , 'valence' ))
815
841
816
842
v_a = ()
817
- for d in substructure_combination :
843
+ if prescribed_structure is not None :
844
+ fragment_indexes = []
845
+ j = - 1
846
+ for i , d in enumerate (substructure_combination ):
818
847
v_a += (tuple (d ["degree_atoms" ].values ()),) # obtain valence configuration of the set of substructures
819
848
849
+ for atom_available in tuple (d ["degree_atoms" ].values ()):
850
+ j += 1
851
+
852
+ try :
853
+ if prescribed_structure is not None and d ["fragment" ]:
854
+ fragment_indexes .append (j )
855
+ except KeyError :
856
+ continue
857
+
820
858
if str (v_a ) not in configs_iso : # check mols "fit" together according to the connectivity database
821
859
continue
822
860
@@ -826,6 +864,16 @@ def substructure_combination_build(substructure_subset, configs_iso):
826
864
continue # check that bond types are compatible (imperfect check)
827
865
828
866
for edges in configs_iso [str (v_a )]: # build mols for each graph in connectivity db
867
+ if prescribed_structure is not None :
868
+ non_fragment_edges = False
869
+
870
+ for edge in edges : # check that edges only connect to fragment ion
871
+ if edge [0 ] not in fragment_indexes and edge [1 ] not in fragment_indexes :
872
+ non_fragment_edges = True
873
+
874
+ if non_fragment_edges :
875
+ continue
876
+
829
877
mol_e = add_bonds (mol_comb , edges , atoms_available , bond_types ) # add bonds between substructures
830
878
831
879
if mol_e is None :
@@ -842,7 +890,7 @@ def substructure_combination_build(substructure_subset, configs_iso):
842
890
continue
843
891
844
892
try : # append the canonical smiles of the final structure
845
- smis .append (Chem .MolToSmiles (mol_out ))
893
+ smis .append (Chem .MolToSmiles (mol_out , isomericSmiles = isomeric_smiles ))
846
894
except RuntimeError :
847
895
continue # bad bond type violation
848
896
0 commit comments