Skip to content
This repository was archived by the owner on Jan 9, 2025. It is now read-only.

Commit 4f2a76b

Browse files
authored
Subset Sum Improvements (computational-metabolomics#10)
* Implement dynamic programming approach to subset sum * Subset sum only used at integer mass * Build function forced to use integer level as the first subset sum stage
1 parent 2f2351b commit 4f2a76b

File tree

8 files changed

+226
-181
lines changed

8 files changed

+226
-181
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ ENV/
108108

109109
# ignore changes to scripts for testing
110110
scripts/*
111+
scripts/results/*
111112

112113
# ignore lib files for testing
113114
*/libgcc_s_dw2-1.dll

metaboverse/build_structures.py

Lines changed: 90 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -32,36 +32,86 @@
3232
from .databases import SubstructureDb
3333

3434

35-
def subset_sum(l, mass, toll=0.001):
35+
def find_path(l, dp, n, mass, max_subset_length, path=[]):
3636
"""
37-
Recursive subset sum algorithm for identifying sets of substructures that make up a given mass; the first step
38-
of building molecules from substructures.
37+
Recursive solution for backtracking through the dynamic programming boolean matrix. All possible subsets are found
3938
4039
:param l: A list of masses from which to identify subsets.
4140
4241
:param mass: The target mass of the sum of the substructures.
4342
44-
:param toll: The allowable deviation of the sum of subsets from the target mass.
43+
:param dp: The dynamic programming boolean matrix.
44+
45+
:param n: The size of l.
46+
47+
:param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to
48+
terminate early in many cases, significantly improving runtime.
49+
50+
:param path: List for keeping track of the current subset.
4551
4652
:return: Generates of lists containing the masses of valid subsets.
4753
"""
4854

49-
if mass < -toll:
55+
# base case - the path has generated a correct solution
56+
if mass == 0:
57+
yield sorted(path)
5058
return
5159

52-
elif len(l) == 0:
53-
if -toll <= mass <= toll:
54-
yield []
60+
# stop running when we overshoot the mass
61+
elif mass < 0:
5562
return
5663

57-
elif abs(sum(l) - mass) <= toll:
58-
yield l
59-
return
64+
# can we sum up to the target value using the remaining masses? recursive call
65+
elif dp[n][mass]:
66+
yield from find_path(l, dp, n-1, mass, max_subset_length, path)
67+
68+
if len(path) < max_subset_length:
69+
path.append(l[n-1])
70+
71+
yield from find_path(l, dp, n-1, mass - l[n-1], max_subset_length, path)
72+
path.pop()
73+
74+
75+
def subset_sum(l, mass, max_subset_length=3):
76+
"""
77+
Dynamic programming implementation of subset sum. Note that, whilst this algorithm is pseudo-polynomial, the
78+
backtracking algorithm for obtaining all possible subsets has exponential complexity and so remains unsuitable
79+
for large input values. This does, however, tend to perform a lot better than non-dp implementations, as we're
80+
no longer doing sums multiple times and we've cut down the operations performed during the exponential portion of
81+
the method.
82+
83+
:param l: A list of masses from which to identify subsets.
84+
85+
:param mass: The target mass of the sum of the substructures.
86+
87+
:param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to
88+
terminate early in many cases, significantly improving runtime.
89+
90+
:return: Generates of lists containing the masses of valid subsets.
91+
"""
92+
n = len(l)
93+
94+
# initialise dynamic programming array
95+
dp = numpy.ndarray([n+1, mass+1], bool)
96+
97+
# subsets can always equal 0
98+
for i in range(n+1):
99+
dp[i][0] = True
60100

61-
for subset in subset_sum(l[1:], mass):
62-
yield subset
63-
for subset in subset_sum(l[1:], mass - l[0]):
64-
yield [l[0]] + subset
101+
# empty subsets do not have non-zero sums
102+
for i in range(mass):
103+
dp[0][i+1] = False
104+
105+
# fill in the remaining boolean matrix
106+
for i in range(n):
107+
for j in range(mass+1):
108+
if j >= l[i]:
109+
dp[i+1][j] = dp[i][j] or dp[i][j-l[i]]
110+
else:
111+
dp[i+1][j] = dp[i][j]
112+
113+
# backtrack through the matrix recursively to obtain all solutions
114+
return find_path(l, dp, n, mass, max_subset_length)
65115

66116

67117
def combine_ecs(ss2_grp, db, table_name, accuracy, ppm=None):
@@ -79,13 +129,6 @@ def combine_ecs(ss2_grp, db, table_name, accuracy, ppm=None):
79129
:param accuracy: To which decimal places of accuracy results are to be limited to.
80130
81131
* **1** Integer level
82-
83-
* **0_1** One decimal place
84-
85-
* **0_01** Two decimal places
86-
87-
* **0_001** Three decimal places
88-
89132
* **0_0001** Four decimal places
90133
91134
:param ppm: The allowable error of the query (in parts per million). If unspecified, only exact matches are
@@ -283,13 +326,6 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
283326
intial subset_sum pass; in the second stage, the maximum accuracy (d.p.) is always used.
284327
285328
* **1** Integer level
286-
287-
* **0_1** One decimal place
288-
289-
* **0_01** Two decimal places
290-
291-
* **0_001** Three decimal places
292-
293329
* **0_0001** Four decimal places
294330
295331
:param max_n_substructures: The maximum number of substructures to be used for building molecules.
@@ -328,7 +364,7 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
328364
db = SubstructureDb(path_db, path_pkls, path_db_k_graphs)
329365

330366
if table_name is None: # generate "temp" table containing only substructures in parameter space
331-
table_name = gen_subs_table(db, heavy_atoms, max_valence, max_atoms_available)
367+
table_name = gen_subs_table(db, heavy_atoms, max_valence, max_atoms_available, round(exact_mass))
332368

333369
if fragment_mass is None: # standard build method
334370
exact_mass__1 = round(exact_mass)
@@ -353,14 +389,11 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
353389
multiprocessing.freeze_support()
354390

355391
# select groups of masses at low mass resolution
356-
mass_values = db.select_mass_values(str(accuracy), [], table_name)
357-
358-
try:
359-
subsets = list(subset_sum(mass_values, exact_mass__1))
360-
except RecursionError: # TODO: Handle subset_sum recursion issue
361-
if debug:
362-
print("Building mol failed due to subset_sum recursion error")
363-
return
392+
mass_values = [m for m in db.select_mass_values("1", [], table_name) if m <= exact_mass__1]
393+
if len(mass_values) == 0:
394+
subsets = []
395+
else:
396+
subsets = list(subset_sum(mass_values, exact_mass__1, max_n_substructures))
364397

365398
configs_iso = db.k_configs()
366399
out = open(fn_out, out_mode)
@@ -372,16 +405,23 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
372405

373406
lls = []
374407
for ss_grp in subsets:
375-
if len(ss_grp) > max_n_substructures:
408+
if len(ss_grp) > max_n_substructures or len(ss_grp) == 0:
376409
continue
377410

378411
# refine groups of masses to 4dp mass resolution
379412
mass_values_r2 = db.select_mass_values("0_0001", ss_grp, table_name)
380-
subsets_r2 = list(subset_sum(mass_values_r2, exact_mass__0_0001, tolerance))
413+
subsets_r2 = []
414+
415+
# use combinations to get second group of masses instead of subset sum - subset sum is integer mass only
416+
for mass_combo in itertools.product(*mass_values_r2):
417+
if abs(sum(mass_combo) - exact_mass__0_0001) <= tolerance:
418+
subsets_r2.append(mass_combo)
419+
420+
if len(subsets_r2) == 0:
421+
continue
381422

382423
if fragment_mass is not None: # add fragments mass to to loss group
383-
for i, subset in enumerate(subsets_r2):
384-
subsets_r2[i] = [round(exact_mass - loss, 4)] + subset
424+
subsets_r2 = [subset + (round(exact_mass - loss, 4),) for subset in subsets_r2]
385425

386426
if debug:
387427
print("""Second round (mass: {}) - Values: {} - Correct Sums: {}
@@ -401,12 +441,15 @@ def build(mc, exact_mass, fn_out, heavy_atoms, max_valence, accuracy, max_atoms_
401441
db.close()
402442

403443

404-
def gen_subs_table(db, heavy_atoms, max_valence, max_atoms_available, table_name="subset_substructures"):
444+
def gen_subs_table(db, heavy_atoms, max_valence, max_atoms_available, max_mass, table_name="subset_substructures"):
405445
"""
406446
Generate a temporary secondary substructure table restricted by a set of parameters. Generated as an initial step
407447
in :py:meth:`metaboverse.build_structures.build` in order to limit the processing overhead as a result of
408448
repeatedly querying the SQLite substructure database.
409449
450+
:param max_mass: The maximum allowed mass of substructures in the temporary table; there is no point considering
451+
substructures with greater mass than the target mol.
452+
410453
:param db: Connection to a :py:meth:`metaboverse.databases.SubstructureDb` from which to extract substructures.
411454
412455
:param heavy_atoms: List of integers used to limit which substructures are transferred into the temporary table.
@@ -428,11 +471,13 @@ def gen_subs_table(db, heavy_atoms, max_valence, max_atoms_available, table_name
428471
SELECT * FROM substructures WHERE
429472
heavy_atoms IN ({}) AND
430473
atoms_available <= {} AND
431-
valence <= {}
474+
valence <= {} AND
475+
exact_mass__1 < {}
432476
""".format(table_name,
433477
",".join(map(str, heavy_atoms)),
434478
max_atoms_available,
435-
max_valence,))
479+
max_valence,
480+
max_mass,))
436481

437482
db.create_indexes(table=table_name, selection="gen_subs_table")
438483

0 commit comments

Comments
 (0)