32
32
from .databases import SubstructureDb
33
33
34
34
35
- def subset_sum (l , mass , toll = 0.001 ):
35
+ def find_path (l , dp , n , mass , max_subset_length , path = [] ):
36
36
"""
37
- Recursive subset sum algorithm for identifying sets of substructures that make up a given mass; the first step
38
- of building molecules from substructures.
37
+ Recursive solution for backtracking through the dynamic programming boolean matrix. All possible subsets are found
39
38
40
39
:param l: A list of masses from which to identify subsets.
41
40
42
41
:param mass: The target mass of the sum of the substructures.
43
42
44
- :param toll: The allowable deviation of the sum of subsets from the target mass.
43
+ :param dp: The dynamic programming boolean matrix.
44
+
45
+ :param n: The size of l.
46
+
47
+ :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to
48
+ terminate early in many cases, significantly improving runtime.
49
+
50
+ :param path: List for keeping track of the current subset.
45
51
46
52
:return: Generates of lists containing the masses of valid subsets.
47
53
"""
48
54
49
- if mass < - toll :
55
+ # base case - the path has generated a correct solution
56
+ if mass == 0 :
57
+ yield sorted (path )
50
58
return
51
59
52
- elif len (l ) == 0 :
53
- if - toll <= mass <= toll :
54
- yield []
60
+ # stop running when we overshoot the mass
61
+ elif mass < 0 :
55
62
return
56
63
57
- elif abs (sum (l ) - mass ) <= toll :
58
- yield l
59
- return
64
+ # can we sum up to the target value using the remaining masses? recursive call
65
+ elif dp [n ][mass ]:
66
+ yield from find_path (l , dp , n - 1 , mass , max_subset_length , path )
67
+
68
+ if len (path ) < max_subset_length :
69
+ path .append (l [n - 1 ])
70
+
71
+ yield from find_path (l , dp , n - 1 , mass - l [n - 1 ], max_subset_length , path )
72
+ path .pop ()
73
+
74
+
75
+ def subset_sum (l , mass , max_subset_length = 3 ):
76
+ """
77
+ Dynamic programming implementation of subset sum. Note that, whilst this algorithm is pseudo-polynomial, the
78
+ backtracking algorithm for obtaining all possible subsets has exponential complexity and so remains unsuitable
79
+ for large input values. This does, however, tend to perform a lot better than non-dp implementations, as we're
80
+ no longer doing sums multiple times and we've cut down the operations performed during the exponential portion of
81
+ the method.
82
+
83
+ :param l: A list of masses from which to identify subsets.
84
+
85
+ :param mass: The target mass of the sum of the substructures.
86
+
87
+ :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to
88
+ terminate early in many cases, significantly improving runtime.
89
+
90
+ :return: Generates of lists containing the masses of valid subsets.
91
+ """
92
+ n = len (l )
93
+
94
+ # initialise dynamic programming array
95
+ dp = numpy .ndarray ([n + 1 , mass + 1 ], bool )
96
+
97
+ # subsets can always equal 0
98
+ for i in range (n + 1 ):
99
+ dp [i ][0 ] = True
60
100
61
- for subset in subset_sum (l [1 :], mass ):
62
- yield subset
63
- for subset in subset_sum (l [1 :], mass - l [0 ]):
64
- yield [l [0 ]] + subset
101
+ # empty subsets do not have non-zero sums
102
+ for i in range (mass ):
103
+ dp [0 ][i + 1 ] = False
104
+
105
+ # fill in the remaining boolean matrix
106
+ for i in range (n ):
107
+ for j in range (mass + 1 ):
108
+ if j >= l [i ]:
109
+ dp [i + 1 ][j ] = dp [i ][j ] or dp [i ][j - l [i ]]
110
+ else :
111
+ dp [i + 1 ][j ] = dp [i ][j ]
112
+
113
+ # backtrack through the matrix recursively to obtain all solutions
114
+ return find_path (l , dp , n , mass , max_subset_length )
65
115
66
116
67
117
def combine_ecs (ss2_grp , db , table_name , accuracy , ppm = None ):
@@ -79,13 +129,6 @@ def combine_ecs(ss2_grp, db, table_name, accuracy, ppm=None):
79
129
:param accuracy: To which decimal places of accuracy results are to be limited to.
80
130
81
131
* **1** Integer level
82
-
83
- * **0_1** One decimal place
84
-
85
- * **0_01** Two decimal places
86
-
87
- * **0_001** Three decimal places
88
-
89
132
* **0_0001** Four decimal places
90
133
91
134
:param ppm: The allowable error of the query (in parts per million). If unspecified, only exact matches are
@@ -283,13 +326,6 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
283
326
intial subset_sum pass; in the second stage, the maximum accuracy (d.p.) is always used.
284
327
285
328
* **1** Integer level
286
-
287
- * **0_1** One decimal place
288
-
289
- * **0_01** Two decimal places
290
-
291
- * **0_001** Three decimal places
292
-
293
329
* **0_0001** Four decimal places
294
330
295
331
:param max_n_substructures: The maximum number of substructures to be used for building molecules.
@@ -328,7 +364,7 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
328
364
db = SubstructureDb (path_db , path_pkls , path_db_k_graphs )
329
365
330
366
if table_name is None : # generate "temp" table containing only substructures in parameter space
331
- table_name = gen_subs_table (db , heavy_atoms , max_valence , max_atoms_available )
367
+ table_name = gen_subs_table (db , heavy_atoms , max_valence , max_atoms_available , round ( exact_mass ) )
332
368
333
369
if fragment_mass is None : # standard build method
334
370
exact_mass__1 = round (exact_mass )
@@ -353,14 +389,11 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
353
389
multiprocessing .freeze_support ()
354
390
355
391
# select groups of masses at low mass resolution
356
- mass_values = db .select_mass_values (str (accuracy ), [], table_name )
357
-
358
- try :
359
- subsets = list (subset_sum (mass_values , exact_mass__1 ))
360
- except RecursionError : # TODO: Handle subset_sum recursion issue
361
- if debug :
362
- print ("Building mol failed due to subset_sum recursion error" )
363
- return
392
+ mass_values = [m for m in db .select_mass_values ("1" , [], table_name ) if m <= exact_mass__1 ]
393
+ if len (mass_values ) == 0 :
394
+ subsets = []
395
+ else :
396
+ subsets = list (subset_sum (mass_values , exact_mass__1 , max_n_substructures ))
364
397
365
398
configs_iso = db .k_configs ()
366
399
out = open (fn_out , out_mode )
@@ -372,16 +405,23 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
372
405
373
406
lls = []
374
407
for ss_grp in subsets :
375
- if len (ss_grp ) > max_n_substructures :
408
+ if len (ss_grp ) > max_n_substructures or len ( ss_grp ) == 0 :
376
409
continue
377
410
378
411
# refine groups of masses to 4dp mass resolution
379
412
mass_values_r2 = db .select_mass_values ("0_0001" , ss_grp , table_name )
380
- subsets_r2 = list (subset_sum (mass_values_r2 , exact_mass__0_0001 , tolerance ))
413
+ subsets_r2 = []
414
+
415
+ # use combinations to get second group of masses instead of subset sum - subset sum is integer mass only
416
+ for mass_combo in itertools .product (* mass_values_r2 ):
417
+ if abs (sum (mass_combo ) - exact_mass__0_0001 ) <= tolerance :
418
+ subsets_r2 .append (mass_combo )
419
+
420
+ if len (subsets_r2 ) == 0 :
421
+ continue
381
422
382
423
if fragment_mass is not None : # add fragments mass to to loss group
383
- for i , subset in enumerate (subsets_r2 ):
384
- subsets_r2 [i ] = [round (exact_mass - loss , 4 )] + subset
424
+ subsets_r2 = [subset + (round (exact_mass - loss , 4 ),) for subset in subsets_r2 ]
385
425
386
426
if debug :
387
427
print ("""Second round (mass: {}) - Values: {} - Correct Sums: {}
@@ -401,12 +441,15 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
401
441
db .close ()
402
442
403
443
404
- def gen_subs_table (db , heavy_atoms , max_valence , max_atoms_available , table_name = "subset_substructures" ):
444
+ def gen_subs_table (db , heavy_atoms , max_valence , max_atoms_available , max_mass , table_name = "subset_substructures" ):
405
445
"""
406
446
Generate a temporary secondary substructure table restricted by a set of parameters. Generated as an initial step
407
447
in :py:meth:`metaboverse.build_structures.build` in order to limit the processing overhead as a result of
408
448
repeatedly querying the SQLite substructure database.
409
449
450
+ :param max_mass: The maximum allowed mass of substructures in the temporary table; there is no point considering
451
+ substructures with greater mass than the target mol.
452
+
410
453
:param db: Connection to a :py:meth:`metaboverse.databases.SubstructureDb` from which to extract substructures.
411
454
412
455
:param heavy_atoms: List of integers used to limit which substructures are transferred into the temporary table.
@@ -428,11 +471,13 @@ def gen_subs_table(db, heavy_atoms, max_valence, max_atoms_available, table_name
428
471
SELECT * FROM substructures WHERE
429
472
heavy_atoms IN ({}) AND
430
473
atoms_available <= {} AND
431
- valence <= {}
474
+ valence <= {} AND
475
+ exact_mass__1 < {}
432
476
""" .format (table_name ,
433
477
"," .join (map (str , heavy_atoms )),
434
478
max_atoms_available ,
435
- max_valence ,))
479
+ max_valence ,
480
+ max_mass ,))
436
481
437
482
db .create_indexes (table = table_name , selection = "gen_subs_table" )
438
483
0 commit comments