Skip to content
This repository was archived by the owner on Jan 9, 2025. It is now read-only.

Commit 62063ee

Browse files
authored
* Return metabolite ID with MSn annotation results
* Rewrite MSn method for limiting connectivity to fragment edges only
* Add option for use of SMILES without non-structural isomeric information
* Update tests
* Correct return type hints
* Implement SQLite3 annotate_msn results database
* Add msn option to ResultsDb
* Re-structure results db tables
* Update add_ms to add ms information to the queries table
* Add function to insert entries into the results and substructures tables
* Add function to get structure frequencies and/or SMILES
* Update user-facing functions for compatibility with ResultsDb
* Remove text-based output and return substructure SMILES from build functions in addition to final structures
* Update build unit tests for ResultsDb
* Add CSV output for build functions
* Check if ResultsDb output matches reference files
* Check ResultsDb CSV files line by line vs reference
* Implement simple bond dissociation energy calculations
* Add integer MS IDs and implement calculate_frequencies to more efficiently calculate structure frequencies
* Re-format get_bond_enthalpies
* Use integer IDs for results DB
* Add retain_substructures option
* Make filter_hmdbid_substructures a filtered version of the hmdbid_substructures table
* Implement the substructure network generation algorithm in SQLite instead of networkx
* Add get_substructure_network function to convert SQLite3 substructure network to a networkx graph
* Implement get_single_edge to get substructure edge weights without the generation of a substructure network
* Add integer substructure key
* Update unit tests
1 parent 13c0200 commit 62063ee

9 files changed

+750
-390
lines changed

metaboblend/build_structures.py

Lines changed: 438 additions & 118 deletions
Large diffs are not rendered by default.

metaboblend/databases.py

Lines changed: 111 additions & 115 deletions
Large diffs are not rendered by default.

tests/test_build_structures.py

Lines changed: 107 additions & 123 deletions
Large diffs are not rendered by default.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
ms_id,exact_mass,C,H,N,O,P,S,ppm,ha_min,ha_max,max_atoms_available,max_degree,max_n_substructures,hydrogenation_allowance,isomeric_smiles
2+
0,HMDB0000073,153.078979,8,11,1,2,0,0,5,,,2,6,3,2,1
3+
1,HMDB0000122,180.06339,6,12,0,6,0,0,5,,,2,6,3,2,1
4+
2,HMDB0000158,181.073894,9,11,1,3,0,0,5,,,2,6,3,2,1
5+
3,HMDB0000186,342.116215,12,22,0,11,0,0,5,,,2,6,3,2,1
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
ms_id,smiles,frequency,exact_mass,C,H,N,O,P,S
2+
0,NCCc1cc(O)cc(O)c1,1
3+
0,NCCc1cc(O)ccc1O,1
4+
0,NCCc1ccc(O)c(O)c1,3
5+
1,OC1C(O)[C@H](O)[C@@H](O)[C@H](O)[C@H]1O,1
6+
1,OC1[C@H](O)C(O)[C@H](O)[C@@H](O)[C@@H]1O,1
7+
1,OC1[C@H](O)[C@@H](O)[C@@H](O)[C@H](O)[C@H]1O,1
8+
1,OC1[C@H](O)[C@@H](O)[C@H](O)[C@@H](O)[C@@H]1O,1
9+
1,OC1[C@H](O)[C@H](O)[C@@H](O)[C@H](O)[C@H]1O,1
10+
1,OC1[C@H](O)[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O,1
11+
1,OC[C@H]1OC(O)C(O)[C@H](O)[C@@H]1O,1
12+
1,OC[C@H]1OC(O)O[C@H](CO)C1O,1
13+
1,OC[C@H]1OC(O)[C@@H](O)C(O)[C@@H]1O,1
14+
1,OC[C@H]1OC(O)[C@@H](O)[C@@H](O)[C@@H]1O,1
15+
1,OC[C@H]1OC(O)[C@@H](O)[C@H](O)[C@@H]1O,1
16+
1,OC[C@H]1OC(O)[C@H](O)C(O)[C@@H]1O,1
17+
1,OC[C@H]1OC(O)[C@H](O)O[C@@H]1CO,1
18+
1,OC[C@H]1OC(O)[C@H](O)[C@@H](CO)O1,1
19+
1,OC[C@H]1OC(O)[C@H](O)[C@@H](O)C1O,1
20+
1,OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O,1
21+
1,OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@H]1O,1
22+
1,OC[C@H]1OC(O)[C@H](O)[C@H](O)[C@@H]1O,1
23+
1,OC[C@H]1OC(O)[C@H](O)[C@H](O)[C@H]1O,1
24+
1,OC[C@H]1OO[C@H](CO)[C@@H](O)[C@@H]1O,1
25+
1,OC[C@H]1OO[C@H](CO)[C@H](O)[C@@H]1O,1
26+
1,OC[C@H]1O[C@@H](O)C(O)[C@@H](O)[C@@H]1O,1
27+
1,OC[C@H]1O[C@@H](O)C(O)[C@@H](O)[C@H]1O,1
28+
1,OC[C@H]1O[C@@H](O)[C@@H](O)[C@@H](CO)O1,1
29+
1,OC[C@H]1O[C@@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,1
30+
1,OC[C@H]1O[C@@H](O)[C@@H](O)[C@@H](O)[C@H]1O,1
31+
1,OC[C@H]1O[C@@H](O)[C@H](O)[C@@H](CO)O1,1
32+
1,OC[C@H]1O[C@@H](O)[C@H](O)[C@@H](O)C1O,1
33+
1,OC[C@H]1O[C@@H](O)[C@H](O)[C@@H](O)[C@@H]1O,1
34+
1,OC[C@H]1O[C@@H](O)[C@H](O)[C@@H](O)[C@H]1O,1
35+
1,OC[C@H]1O[C@H](O)[C@@H](CO)OC1O,1
36+
1,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,1
37+
1,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@H]1O,1
38+
1,OC[C@H]1O[C@H](O)[C@H](O)C(O)[C@@H]1O,1
39+
1,OC[C@H]1O[C@H](O)[C@H](O)O[C@@H]1CO,1
40+
1,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](CO)O1,1
41+
1,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)C1O,1
42+
1,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O,1
43+
1,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@H]1O,1
44+
1,OC[C@H]1O[C@H](O)[C@H](O)[C@H](O)[C@@H]1O,1
45+
1,OC[C@H]1O[C@H](O)[C@H](O)[C@H](O)[C@H]1O,1
46+
2,N[C@@H](Cc1ccc(O)cc1)C(=O)O,1
47+
2,N[C@@H](Cc1cccc(O)c1)C(=O)O,1
-76 KB
Binary file not shown.

tests/test_databases.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def test_create_substructure_database(self):
226226
test_db_cursor.execute("SELECT * FROM hmdbid_substructures")
227227
for i, row in enumerate(test_db_cursor.fetchall()):
228228
if i == 0:
229-
self.assertEqual(row, ('HMDB0000073', '*:c(:*)CCN'))
229+
self.assertEqual(row, ('HMDB0000073', 1))
230230
total_rows = i
231231

232232
self.assertEqual(total_rows, 1292)
@@ -262,6 +262,8 @@ def test_update_substructure_database(self): # requires create_compound_databas
262262
self.to_test_results("test_db.sqlite"), 4, 8,
263263
method="exhaustive", isomeric_smiles=True)
264264

265+
shutil.copyfile(self.to_test_data("substructures.sqlite"), self.to_test_results("substructures_copy.sqlite"))
266+
265267
test_db = sqlite3.connect(self.to_test_results("test_db.sqlite"))
266268
test_db_cursor = test_db.cursor()
267269

@@ -296,7 +298,7 @@ def test_update_substructure_database(self): # requires create_compound_databas
296298
test_db_cursor.execute("SELECT * FROM hmdbid_substructures")
297299
for i, row in enumerate(test_db_cursor.fetchall()):
298300
if i == 0:
299-
self.assertEqual(row, ('HMDB0000073', '*:c(:*)CCN'))
301+
self.assertEqual(row, ('HMDB0000073', 1))
300302
total_rows = i
301303

302304
self.assertEqual(total_rows, 1292)

tests/test_substructure_database.py

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,16 @@ def test_init(self):
5050
self.to_test_data("connectivity.sqlite"))
5151

5252
db.cursor.execute("SELECT * FROM substructures")
53-
first_row = db.cursor.fetchone()[0:17]
54-
self.assertEqual(first_row, ('*:c(:*)CCN', 4, 10, 56, 56.05, 56.05002399999998, 3, 6, 1, 0, 0, 0, 2, '{3: 2}',
55-
1, '{3: [1.5, 1.5]}', '[4, 5]'))
53+
first_row = db.cursor.fetchone()[0:18]
54+
self.assertEqual(first_row, (1, '*:c(:*)CCN', 4, 10, 56, 56.05, 56.05002399999998, 3, 6, 1, 0, 0, 0, 2,
55+
'{3: 2}', 1, '{3: [1.5, 1.5]}', '[4, 5]'))
5656

57-
self.assertTrue(Chem.MolFromSmiles(first_row[0], False))
57+
self.assertTrue(Chem.MolFromSmiles(first_row[1], False))
5858
self.assertEqual(len(db.cursor.fetchall()), 1235)
5959

6060
db.cursor.execute("SELECT * FROM hmdbid_substructures")
6161
first_row = db.cursor.fetchone()
62-
self.assertEqual(first_row, ('HMDB0000073', '*:c(:*)CCN'))
62+
self.assertEqual(first_row, ('HMDB0000073', 1))
6363
self.assertEqual(len(db.cursor.fetchall()), 1292)
6464

6565
db.cursor.execute("SELECT * FROM compounds")
@@ -85,43 +85,39 @@ def test_select_compounds(self):
8585
def test_filter_hmdbid_substructures(self):
8686
db = SubstructureDb(self.to_test_data("substructures.sqlite"))
8787
db.filter_hmdbid_substructures(2)
88-
db.cursor.execute("SELECT * FROM unique_hmdbid")
89-
self.assertEqual(db.cursor.fetchall(), [('HMDB0000073',), ('HMDB0000122',), ('HMDB0000158',), ('HMDB0000186',)])
9088

91-
db.cursor.execute("SELECT * FROM filtered_hmdbid_substructures")
92-
for i, unique_substructure in enumerate(db.cursor.fetchall()):
93-
self.assertLessEqual(i, 56)
94-
self.assertGreaterEqual(unique_substructure[1], 2)
89+
db.cursor.execute("SELECT COUNT(*) FROM filtered_hmdbid_substructures GROUP BY hmdbid")
90+
for i, hmdbid_count in enumerate(db.cursor.fetchall()):
91+
self.assertGreater(hmdbid_count[0], 1)
92+
93+
self.assertEqual(i, 3)
9594

9695
db.close()
9796

98-
def test_generate_substructure_network(self): # also tests close
97+
def test_generate_substructure_network(self): # also tests get_substructure_network, get_single_edge and close
9998
db = SubstructureDb(self.to_test_data("substructures.sqlite"))
100-
std = db.generate_substructure_network(method="default", min_node_weight=2, remove_isolated=False)
101-
extended = db.generate_substructure_network(method="extended", min_node_weight=2, remove_isolated=False)
102-
parent = db.generate_substructure_network(method="parent_structure_linkage", min_node_weight=2,
103-
remove_isolated=False)
10499

105-
for s in std.nodes:
106-
self.assertTrue(s in extended.nodes and s in parent.nodes)
100+
self.assertEqual(db.get_single_edge([3, 4, 2]), {3: {3: None, 4: 2}, 2: {3: 1, 4: 1, 2: None}, 4: {4: None}})
107101

108-
db.cursor.execute("select * from unique_hmdbid")
109-
edge_count = []
102+
std = db.generate_substructure_network(min_node_weight=2, return_networkx=True)
103+
104+
db.cursor.execute("SELECT * FROM filtered_hmdbid_substructures")
110105
for hmdb in db.cursor.fetchall():
111-
self.assertTrue(hmdb[0] in parent.nodes)
112-
edge_count.append(len(parent.edges(hmdb[0])))
113106

114-
db.cursor.execute("select distinct smiles from filtered_hmdbid_substructures")
107+
self.assertTrue(hmdb[1] in std.nodes)
108+
109+
db.cursor.execute("SELECT DISTINCT substructure_id FROM filtered_hmdbid_substructures")
115110
self.assertEqual(len(db.cursor.fetchall()), 57)
116111
self.assertEqual(std.number_of_nodes(), 57)
117-
self.assertEqual(extended.number_of_nodes(), 57)
118-
self.assertEqual(parent.number_of_nodes() - 4, 57)
119112

120113
self.assertEqual(std.number_of_edges(), 1024)
121-
self.assertEqual(extended.number_of_edges(), 1024)
122114

123-
self.assertEqual(parent.number_of_edges(), 114)
124-
self.assertEqual(sum(edge_count), 114)
115+
edge_count = []
116+
db.cursor.execute("SELECT * FROM substructure_graph")
117+
for edge in db.cursor.fetchall():
118+
edge_count.append(std.get_edge_data(edge[0], edge[1])["weight"])
119+
120+
self.assertEqual(sum(edge_count), 2048)
125121

126122
db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
127123
self.assertEqual(len(db.cursor.fetchall()), 5)
@@ -136,7 +132,7 @@ def test_generate_substructure_network(self): # also tests close
136132

137133
db = SubstructureDb(self.to_test_data("substructures.sqlite"))
138134
db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
139-
self.assertEqual(len(db.cursor.fetchall()), 3)
135+
self.assertEqual(len(db.cursor.fetchall()), 4)
140136

141137
db.close()
142138

@@ -204,14 +200,24 @@ def test_select_substructures(self):
204200

205201
substructures = list(db.select_substructures([[4, 4, 0, 0, 0, 0]], "substructures")[0][0].values())
206202
self.assertEqual([item for i, item in enumerate(substructures) if i != 1],
207-
['*C(*)Cc(:*)c:*', {1: [1.0, 1.0], 3: [1.5], 6: [1.5]}, {1: 2, 3: 1, 6: 1}, 4, 3, [0, 4, 5, 7]])
203+
['*Cc(:*)cc:*',
204+
{1: [1.0], 2: [1.5], 5: [1.5]},
205+
{1: 1, 2: 1, 5: 1},
206+
3,
207+
3,
208+
[0, 3, 4]])
208209

209210
self.assertEqual(len(db.select_substructures([[7, 7, 0, 0, 0, 0]], "substructures")[0]), 3)
210211
self.assertEqual(list(db.select_substructures([[7, 7, 0, 0, 0, 0]], "substructures")[0][0].keys()),
211212
['smiles', 'mol', 'bond_types', 'degree_atoms', 'valence', 'atoms_available', 'dummies'])
212213
substructures = list(db.select_substructures([[7, 7, 0, 0, 0, 0]], "substructures")[0][0].values())
213214
self.assertEqual([item for i, item in enumerate(substructures) if i != 1],
214-
['*C(*)Cc1cc:*:cc1', {1: [1.0, 1.0], 5: [1.5], 7: [1.5]}, {1: 2, 5: 1, 7: 1}, 4, 3, [0, 6, 9]])
215+
['*CCc1c:*:c(*)cc1',
216+
{1: [1.0], 4: [1.5], 6: [1.5, 1.0]},
217+
{1: 1, 4: 1, 6: 2},
218+
4,
219+
3,
220+
[0, 5, 7]])
215221

216222
self.assertRaises(sqlite3.OperationalError,
217223
lambda: db.select_substructures([[2, 5, 0, 0, 0, 0]], "substrusctures"))

tests/test_suite_build_structures.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,4 @@
3636
suite.addTest(unittest.findTestCases(test_build_structures))
3737

3838
report = os.path.join(os.path.abspath(os.path.join(__file__, os.pardir)), 'results', 'results_test_suite_build_structures')
39-
runTestSuite(suite, report, title='Process Test Suite Report',verbosity=2)
39+
runTestSuite(suite, report, title='Process Test Suite Report', verbosity=2)

0 commit comments

Comments
 (0)