diff --git a/.coveragerc b/.coveragerc index 7088fec..6a27c02 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,2 +1,2 @@ [run] -omit = tests/*,setup.py,metaboblend/__main__.py \ No newline at end of file +omit = tests/*,setup.py,metaboblend/__main__.py,notebooks/ \ No newline at end of file diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index fbecba7..27fe167 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -8,8 +8,8 @@ jobs: strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: [3.6, 3.7, 3.8] + os: [ ubuntu-latest, windows-latest, macos-latest ] + python-version: [ 3.7, 3.8, 3.9 ] env: OS: ${{ matrix.os }} @@ -19,21 +19,29 @@ jobs: - uses: actions/checkout@v2 - name: Setup conda - Python ${{ matrix.python-version }} - uses: s-weigand/setup-conda@v1 + uses: conda-incubator/setup-miniconda@v2 with: - update-conda: true + auto-update-conda: true + activate-environment: metaboblend python-version: ${{ matrix.python-version }} - conda-channels: anaconda, conda-forge + environment-file: environment.yml + channels: anaconda, conda-forge - - name: Install dependencies + - name: Build MetaboBlend + shell: bash -l {0} run: | + python setup.py install + metaboblend --help - python --version - conda env update --file environment.yml --name base + - name: Test with pytest-cov + shell: bash -l {0} + run: | + conda install pytest codecov pytest-cov -c conda-forge + pytest --cov ./ --cov-config=.coveragerc --cov-report=xml - name: Lint with flake8 + shell: bash -l {0} run: | - conda install flake8 # stop build if there are Python syntax errors or undefined names @@ -42,15 +50,6 @@ jobs: # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest-cov - run: | - - python setup.py install - metaboblend --help - - conda install pytest codecov pytest-cov -c conda-forge - pytest --cov ./ --cov-config=.coveragerc --cov-report=xml - - name: Upload code coverage to codecov uses: codecov/codecov-action@v1 with: diff --git a/.gitignore b/.gitignore index 640be3e..ad6f045 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,8 @@ target/ # Jupyter Notebook .ipynb_checkpoints +notebooks/notebook_data +notebooks/notebook_data/* # pyenv .python-version @@ -105,4 +107,4 @@ ENV/ # ignore test files */libgcc_s_dw2-1.dll */libstdc++-6.dll -tests/test* +tests/tmp* diff --git a/README.rst b/README.rst index ca4f8d5..a9b724a 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,9 @@ MetaboBlend =========== -|Version| |Py versions| |Git| |Bioconda| |Build Status| |License| |RTD doc| |codecov| |binder| +.. + |Version| |Py versions| |Bioconda| |RTD doc| |License| |binder| + +|Git| |Build Status| |codecov| Python package for *de novo* structural elucidation of small molecules in mass spectrometry-based Metabolomics @@ -32,12 +35,11 @@ will help you to make the PR if you are new to `git`. Developers & Contributors ------------------------- - Ralf J. M. Weber (r.j.weber@bham.ac.uk) - `University of Birmingham (UK) `_ - - Jack Gisby (jackgisby@gmail.com) - `University of Birmingham (UK) `_ - + - Jack Gisby (jackgisby@gmail.com) - `University of Birmingham (UK) `_, `Imperial College London (UK) `_ Licenses -------- -MetaboBlend is licensed under the GNU General Public License v3.0 (see `LICENSE file `_ for licensing information). Copyright © 2019 - 2020 Ralf Weber +MetaboBlend is licensed under the GNU General Public License v3.0 (see `LICENSE file `_ for licensing information). Copyright © 2019 - 2020 Jack Gisby, Ralf Weber .. 
|Build Status| image:: https://github.com/computational-metabolomics/metaboblend/workflows/metaboblend/badge.svg diff --git a/binder/environment.yml b/binder/environment.yml new file mode 100644 index 0000000..8b3f2c2 --- /dev/null +++ b/binder/environment.yml @@ -0,0 +1,17 @@ +name: metaboblend +channels: + - conda-forge + - bioconda +dependencies: + - python=3.7 + - numpy + - scipy + - pandas + - networkx + - rdkit + - biopython + - matplotlib + - nauty + - pip + - pip: + - -e ../ diff --git a/docs/source/conf.py b/docs/source/conf.py index 562abf6..5ab6491 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -20,7 +20,7 @@ # -- Project information ----------------------------------------------------- project = 'MetaboBlend' -copyright = '2020, Ralf Weber' +copyright = '2020, Jack Gisby, Ralf Weber' author = 'Jack Gisby, Ralf Weber' # -- General configuration --------------------------------------------------- diff --git a/docs/source/license.rst b/docs/source/license.rst index f9f1639..1598200 100644 --- a/docs/source/license.rst +++ b/docs/source/license.rst @@ -1,4 +1,4 @@ License ------- TODO: change package name -*MetaboBlend* is licensed under the GNU General Public License v3.0 (see `LICENSE file `_ for licensing information). Copyright © 2019 - 2020 Ralf Weber +*MetaboBlend* is licensed under the GNU General Public License v3.0 (see `LICENSE file `_ for licensing information). 
Copyright © 2019 - 2020 Jack Gisby, Ralf Weber diff --git a/environment.yml b/environment.yml index 72bb2b8..468db5f 100644 --- a/environment.yml +++ b/environment.yml @@ -1,14 +1,12 @@ name: metaboblend channels: - conda-forge - - bioconda dependencies: - - python>=3.6 + - python>=3.7 + - pillow!=9.2.0 + - pyqt + - matplotlib - numpy - - scipy - - pandas - networkx - rdkit - - biopython - - matplotlib - nauty diff --git a/metaboblend/__init__.py b/metaboblend/__init__.py index 1bcae72..38ba8cb 100644 --- a/metaboblend/__init__.py +++ b/metaboblend/__init__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2019-2020 Ralf Weber +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. # @@ -19,7 +19,7 @@ # along with MetaboBlend. If not, see . # -__author__ = 'Ralf Weber (r.j.weber@bham.ac.uk)' -__credits__ = 'Ralf Weber (r.j.weber@bham.ac.uk)' +__authors__ = ['Ralf Weber (r.j.weber@bham.ac.uk)', 'Jack Gisby (jackgisby@gmail.com)'] +__credits__ = ['Ralf Weber (r.j.weber@bham.ac.uk)', 'Jack Gisby (jackgisby@gmail.com)'] __version__ = '0.1.0' __license__ = 'GPLv3' diff --git a/metaboblend/__main__.py b/metaboblend/__main__.py index 65d2945..f3bb6cf 100644 --- a/metaboblend/__main__.py +++ b/metaboblend/__main__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2019-2020 Ralf Weber +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. # diff --git a/metaboblend/algorithms.py b/metaboblend/algorithms.py index 624b00a..97dd6d5 100644 --- a/metaboblend/algorithms.py +++ b/metaboblend/algorithms.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2019-2020 Ralf Weber +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. 
# @@ -20,9 +20,10 @@ # import numpy +from math import sqrt -def find_path(mass_list, sum_matrix, n, mass, max_subset_length, path=[]): +def find_path(mass_list, sum_matrix, n, mass, max_subset_length, path=None): """ Recursive solution for backtracking through the dynamic programming boolean matrix. All possible subsets are found @@ -42,6 +43,9 @@ def find_path(mass_list, sum_matrix, n, mass, max_subset_length, path=[]): :return: Generates of lists containing the masses of valid subsets. """ + if path is None: + path = [] + # base case - the path has generated a correct solution if mass == 0: yield sorted(path) @@ -103,3 +107,29 @@ def subset_sum(mass_list, mass, max_subset_length=3): # backtrack through the matrix recursively to obtain all solutions return find_path(mass_list, sum_matrix, n, mass, max_subset_length) + + +def cosine_spectrum_similarity(real_mzs, candidate_mzs): + """ + Database fragmentation scoring based on the cosine similarity method. Adapted for the lack of intensities + available for the candidate compound. + + :param real_mzs: The mz values for the original MSn spectrum. + + :param candidate_mzs: The theoretical mz values for a candidate compound. Should have the same order as `real_mzs` + and should have a value of `0` when there is no match for the candidate for a peak in the original spectrum. + + :return: Similarity metric for the two spectra. 
+ """ + + # get weighted peaks + real_weighted = [(mz ** 2) for mz in real_mzs] + candidate_weighted = [(mz ** 2) for mz in candidate_mzs] + + def dot(E, D): + return sum(e * d for e, d in zip(E, D)) + + def cosine_similarity(E, D): + return dot(E, D) / (sqrt(dot(E, E)) * sqrt(dot(D, D))) + + return cosine_similarity(real_weighted, candidate_weighted) diff --git a/metaboblend/build_structures.py b/metaboblend/build_structures.py deleted file mode 100644 index 16987dc..0000000 --- a/metaboblend/build_structures.py +++ /dev/null @@ -1,890 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright © 2019-2020 Ralf Weber -# -# This file is part of MetaboBlend. -# -# MetaboBlend is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# MetaboBlend is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with MetaboBlend. If not, see . -# - -import os -import copy -import numpy -import itertools -import multiprocessing -import networkx as nx -from functools import partial -from operator import itemgetter -from typing import Sequence, Dict, Union - -from rdkit import Chem - -from .results import ResultsDb -from .parse import parse_ms_data -from .algorithms import subset_sum -from .databases import SubstructureDb - - -def combine_mfs(precise_mass_grp, db, table_name, accuracy): - """ - A wrapper for :py:meth:`metaboblend.databases.select_ecs` that instead takes a group of subsets, as generated by - the second stage of :py:meth:`metaboblend.build_structures.subset_sum` in - :py:meth:`metaboblend.build_structures.build`. 
- - :param precise_mass_grp: A list containing the masses of substructures identified by subset_sum. - - :param db: The :py:meth:`metaboblend.databases.SubstructureDb` in which to search for elemental compositions. - - :param table_name: The name of the table containing substructures in which to search for elemental compositions. - - :param accuracy: To which decimal places of accuracy results are to be limited to. - - * **1** Integer level - * **0_0001** Four decimal places - - :return: If there are no elemental compositions for any of the masses in the group, then an empty list is returned. - """ - - ecs = [] - - for i in range(len(precise_mass_grp)): - atoms = db.select_mfs(precise_mass_grp[i], table_name, accuracy) - - if len(atoms) == 0: - return [] - - ecs.append(atoms) - - return ecs - - -def reindex_atoms(records): - """ - Parses the libs of groups of substructures that are to be combined; the lib is a dictionary containing details - about the substructure, as generated by :py:meth:`metaboblend.databases.get_substructure`. Combines the - molecules into a single :py:meth:`rdkit.Chem.Mol` object and obtains details on their bonding properties. - - :param records: Takes a list of lib dictionaries that contain details on each substructure to be combined. - - :return: Returns a tuple containing a :py:meth:`rdkit.Chem.CombineMols` object, that stores all the substructures - as a single molecule, followed by information on the substructure bonding properties, including: - - * **atoms_available** A list of the indices of atoms that are available for bonding. - - * **atoms_to_remove** A list of the indices of dummy atoms that are to be removed in order to bond with other - substructures. - - * **bond_types** A dictionary containing the indices of atoms that are available for bonding as keys and values - detailing their bond types. See :py:meth:`metaboblend.build_structures.add_bonds`. 
- """ - - atoms_available, atoms_to_remove, bond_types = [], [], {} - mol_comb = Chem.Mol() - index_atoms, all_bond_types = [], {} - c = 0 - - for i, record in enumerate(records): - idxs = [] - all_bond_types[i] = [] - for atom in record["mol"].GetAtoms(): - - new_idx = atom.GetIdx() + c - idxs.append(new_idx) - - if atom.GetIdx() in record["degree_atoms"]: - atoms_available.append(new_idx) - - if atom.GetIdx() in record["dummies"]: - atoms_to_remove.append(new_idx) - - if atom.GetIdx() in record["bond_types"]: - bond_types[new_idx] = record["bond_types"][atom.GetIdx()] - all_bond_types[i] += record["bond_types"][atom.GetIdx()] - - mol_comb = Chem.CombineMols(mol_comb, record["mol"]) - index_atoms.append(idxs) - c = idxs[-1] + 1 - - # check that bond types add up - removes some mismatched configurations - bond_mismatch = False - for i in range(len(records)): - other_bonds = [] - for j in range(len(records)): - if i != j: - other_bonds += all_bond_types[j] - - for bond in all_bond_types[i]: - if bond not in other_bonds: - bond_mismatch = True - - return mol_comb, atoms_available, atoms_to_remove, bond_types, bond_mismatch - - -def add_bonds(mols, edges, atoms_available, bond_types, bond_enthalpies): - """ - Takes a set of substructures and attempts to combine them together to generate a final structure. One of the last - steps in the :py:meth:`metaboblend.build_structures.build` workflow. - - :param mols: A :py:meth:`rdkit.Chem.CombineMols` object, that stores all the substructures - as a single molecule. - - :param edges: The edges to use in order to join the substructures together, obtained from the connectivity database - (:py:meth:`metaboblend.databases.create_isomorphism_database`). - - :param atoms_available: A list of the indices of atoms that are available for bonding. - - :param bond_types: The type of bonds to be formed by dummy atoms - see :py:meth:`Chem.rdchem.BondType`. 
Is a - dictionary whose keys are atom indices and values are bond types, as follows: - - * **1.0** Single - - * **1.5** Aromatic - - * **2.0** Double - - :param bond_enthalpies: Dictionary of bond enthalpies, as generated by - :py:meth:`metaboblend.build_structures.get_bond_enthalpies`. - - :return: If unsuccessful, returns None, else returns an :py:meth:`rdkit.Chem.EditableMol` object containing - the substructures combined into a final single molecule. - """ - - rdkit_bond_types = {1: Chem.rdchem.BondType.SINGLE, - 1.5: Chem.rdchem.BondType.AROMATIC, - 2: Chem.rdchem.BondType.DOUBLE} - - bond_types_copy = copy.deepcopy(bond_types) # deep copy as we modify items within the dict - - g = nx.Graph() - g.add_edges_from(edges) - - g = nx.relabel_nodes(g, dict(zip(sorted(g.nodes()), atoms_available))) - - total_bde = 0 - - mol_edit = Chem.EditableMol(mols) - for edge in g.edges(): - - if edge[0] in bond_types_copy: - bt_start = bond_types_copy[edge[0]] - else: - return None, None # nested dummy - - if edge[1] in bond_types_copy: - bt_end = bond_types_copy[edge[1]] - else: - return None, None # nested dummy - - bond_matches = list(set(bt_start).intersection(bt_end)) - - if len(bond_matches) == 0: - return None, None - - else: - bt_start.remove(bond_matches[0]) - bt_end.remove(bond_matches[0]) - - try: # try forming the specified bond - mol_edit.AddBond(edge[0], edge[1], rdkit_bond_types[bond_matches[0]]) - except KeyError: - return None, None # unknown bond type - - # calculate bond dissociation energy of "formed" bonds for the structure - try: - total_bde += bond_enthalpies[bond_matches[0]][mols.GetAtomWithIdx(edge[0]).GetSymbol()][mols.GetAtomWithIdx(edge[1]).GetSymbol()] - except (SyntaxError, TypeError): - total_bde = None - - return mol_edit, total_bde - - -def annotate_msn(msn_data: Union[str, os.PathLike, Dict[str, Dict[str, Union[int, list]]]], - path_substructure_db: Union[str, bytes, os.PathLike] = os.path.realpath(os.getcwd()), - path_out: Union[str, bytes, 
os.PathLike] = "", - ppm: int = 5, - ha_min: Union[int, None] = None, - ha_max: Union[int, None] = None, - max_atoms_available: int = 2, - max_degree: int = 6, - max_n_substructures: int = 3, - path_connectivity_db: Union[str, bytes, os.PathLike, None] = None, - ncpus: Union[int, None] = None, - minimum_frequency: Union[int, None] = None, - hydrogenation_allowance: int = 2, - yield_smis: bool = True, - isomeric_smiles: bool = False, - write_csv_output: bool = True, - retain_substructures: bool = False - ) -> Dict[str, Sequence[Dict[str, int]]]: - """ - Generate molecules of a given mass using chemical substructures, connectivity graphs and spectral trees or - fragmentation spectra. Final structures and rankings are yielded by the function as a dictionary and/or written in - text format. For the generation of structures without MSn data, see - :py:meth:`metaboblend.build_structures.generate_structures`. - - :param msn_data: Either a dictionary or the path to an MSP file. MSP files are parsed by - :py:meth:`metaboblend.parse.parse_ms_data` before being converted into a dictionary. If a dictionary is - provided, it must contain one item per fragmentation spectrum; the keys of the dictionary should be a unique ID - for the query and the corresponding value must itself be a dictionary, containing the following: - - - "exact_mass": `float` (neutral mass of query) OR "precursor_mz": `float` (mz of precursor ion) - - "mf": `[C, H, N, O, P, S]` (a list of 6 integers) - - "neutral_fragment_masses": `[float, float, ...]` (list of neutral fragment masses) OR "fragment_mzs": - `[float, float, ...]` (list of fragment mzs) - - "precursor_type": `str` (e.g. "[M+H]+", required for calculating neutral masses from ion mzs) - - The dictionary or MSP path is fed to :py:meth:`metaboverse.parse.parse_ms_data`. - - :param path_substructure_db: The path to the SQLite 3 substructure database, as generated by - :py:meth:`metaboblend.databases.SubstructureDb`. 
- - :param path_out: Folder to which the SQLite 3 results database and CSV outputs should be written. - - :param ppm: The maximal tolerated m/z deviation (in parts per million) of the mass of substructures from the - supplied `fragment_masses`. - - :param ha_min: The minimum size (number of heavy atoms) of substructures to be used to build final structures. If - None, no limit is applied. - - :param ha_max: The maximum size (number of heavy atoms) of substructures to be used to build final structures. If - None, no limit is applied. - - :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for - building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming - chemical bonds (e.g. single or double bonds). `atoms_available` are also limited by the extensivity of the - supplied connectivity database. - - :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We - define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, - 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For - instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have - a total degree of 3. - - :param max_n_substructures: The maximum number of substructures to be used for building molecules. The max number - of substructures is also limited by the extensivity of the supplied connectivity database. - - :param path_connectivity_db: The path to the SQLite 3 connectivity database, as generated by - :py:meth:`metaboblend.databases.create_isomorphism_database`. If the path is None, the default connectivity - database bundled with MetaboBlend will be used. - - :param ncpus: How many CPUs to utilise; if left as None, :py:meth:`os.cpu_count` is used. 
- - :param minimum_frequency: The minimum frequency of substructures in table_name; e.g. substructures have a frequency - of 1 if they are unique. Defaults to None, in which case this filtering method is not applied. - - :param hydrogenation_allowance: In order to represent re-arrangement events (the movement of hydrogens), in - addition to attempting to build from substructures in prescribed_masses, we also attempt to build from - `fragment_masses +- hydrogenation_allowance`. E.g. if `prescribed_masses = [141.5938]` and - `hydrogenation_allowance = 1` then, to find candidate fragment substructures, we use as query the masses - `[141.5938 - 1.007825, 141.5938, 141.5938 + 1.007825]`. - - :param yield_smis: If True, for each input molecule the function yields SMILEs the number of `fragment_masses` by - which the structure was generated. Else, returns None. - - :param isomeric_smiles: If True, writes smiles with non-structural isomeric information. - - :param write_csv_output: Whether to extract results from the SQLite3 database for deposition in CSV files. - - :param retain_substructures: Whether to record the substructures used to generate final structures. - - :return: For each input molecule yields a dictionary whose keys are SMILEs strings for the generated - structures and values are the number of `fragment_masses` by which the structure was built (unless - `yield_smi_dict = False`). 
- """ - - if ppm is None: - ppm = 0 - - if path_connectivity_db is None: - path_connectivity_db = os.path.join(os.path.realpath(os.path.dirname(__file__)), "data", "connectivity.sqlite") - - db = SubstructureDb(path_substructure_db, path_connectivity_db) - results_db = ResultsDb(path_out) - results_db.create_results_db() - - # prepare temporary table here - will only be generated once in case of multiple input - table_name = gen_subs_table( - db=db, - ha_min=ha_min, - ha_max=ha_max, - max_degree=max_degree, - max_atoms_available=max_atoms_available, - minimum_frequency=minimum_frequency, - max_mass=None - ) - - for i, ms in enumerate(parse_ms_data(msn_data)): - - if ms is None: - continue - - results_db.add_ms(msn_data, ms["ms_id"], i, - [ppm, ha_min, ha_max, max_atoms_available, max_degree, max_n_substructures, hydrogenation_allowance, isomeric_smiles]) - - for j, fragment_mass in enumerate(ms["neutral_fragment_masses"]): - - for k in range(0 - hydrogenation_allowance, hydrogenation_allowance + 1): - hydrogenated_fragment_mass = fragment_mass + (k * 1.007825) # consider re-arrangements - - smi_dict = build( - mf=ms["mf"], - exact_mass=ms["exact_mass"], - max_n_substructures=max_n_substructures, - path_connectivity_db=path_connectivity_db, - path_substructure_db=path_substructure_db, - prescribed_mass=hydrogenated_fragment_mass, - ppm=ppm, - table_name=table_name, - ncpus=ncpus, - clean=False, - isomeric_smiles=isomeric_smiles, - retain_substructures=retain_substructures - ) - - results_db.add_results(i, smi_dict, fragment_mass, j, retain_substructures) - smi_dict = None - - results_db.calculate_frequencies(i) - - if yield_smis: - yield {ms["ms_id"]: results_db.get_structures(i)} - - if write_csv_output: - results_db.generate_csv_output() - - db.close() - results_db.close() - - -def generate_structures(ms_data: Union[str, os.PathLike, Dict[str, Dict[str, Union[int, None]]]], - path_substructure_db: Union[str, bytes, os.PathLike], - path_out: Union[str, bytes, 
os.PathLike] = os.path.realpath(os.getcwd()), - ha_min: Union[int, None] = 2, - ha_max: Union[int, None] = 9, - max_degree: int = 6, - max_atoms_available: int = 2, - max_n_substructures: int = 3, - ncpus: Union[int, None] = None, - path_connectivity_db: Union[str, bytes, os.PathLike, None] = None, - minimum_frequency: Union[int, None] = None, - yield_smis: bool = True, - isomeric_smiles: bool = False, - write_csv_output: bool = True, - retain_substructures: bool = False - ) -> Dict[str, Sequence[set]]: - """ - Generate molecules of a given mass using chemical substructures and connectivity graphs. Can optionally take a - "prescribed" fragment mass to further filter results. Final structures are returned as a list and/or written in - text format. For the generation of structures from MSn data, see - :py:meth:`metaboblend.build_structures.annotate_msn`. - - :param ms_data: A dictionary that must contain one item per fragmentation spectrum; the keys of the dictionary - should be a unique ID for the query and the corresponding value must itself be a dictionary, containing the - following: - - - "exact_mass": `float` (neutral mass of query) OR "precursor_mz": `float` (mz of precursor ion) - - "mf": `[C, H, N, O, P, S]` (a list of 6 integers) - - "precursor_type": `str` (e.g. "[M+H]+", required for calculating neutral masses from ion mzs) - - (optional) "prescribed_mass": 'float' (neutral mass of substructure). - - The dictionary or MSP path is fed to :py:meth:`metaboverse.parse.parse_ms_data`. A single neutral substructure - mass may be provided ("prescribed_mass") to guide the structure generation process. - - :param path_substructure_db: The path to the SQLite 3 substructure database, as generated by - :py:meth:`metaboblend.databases.SubstructureDb`. - - :param path_out: Folder to which the SQLite 3 results database and CSV outputs should be written. - - :param ha_min: The minimum size (number of heavy atoms) of substructures to be used to build final structures. 
If - None, no limit is applied. - - :param ha_max: The maximum size (number of heavy atoms) of substructures to be used to build final structures. If - None, no limit is applied. - - :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We - define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, - 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For - instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have - a total degree of 3. - - :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for - building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming - chemical bonds (e.g. single or double bonds). `atoms_available` are also limited by the extensivity of the - supplied connectivity database. - - :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We - define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, - 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For - instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have - a total degree of 3. - - :param max_n_substructures: The maximum number of substructures to be used for building molecules. The max number - of substructures is also limited by the extensivity of the supplied connectivity database. - - :param path_connectivity_db: The path to the SQLite 3 connectivity database, as generated by - :py:meth:`metaboblend.databases.create_isomorphism_database`. If the path is None, the default connectivity - database bundled with MetaboBlend will be used. 
- - :param ncpus: How many worker processes to utilise; if left as None, :py:meth:`os.cpu_count` is used. - - :param minimum_frequency: The minimum frequency of substructures in table_name; e.g. substructures have a frequency - of 1 if they are unique. Defaults to None, in which case this filtering method is not applied. - - :param yield_smis: If True, yields a set of unique SMILEs string for each input molecule, else returns None. - - :param isomeric_smiles: If True, writes smiles with non-structural isomeric information. - - :param write_csv_output: Whether to extract results from the SQLite3 database for deposition in CSV files. - - :param retain_substructures: Whether to record the substructures used to generate final structures. - - :return: For each input molecule, yields unique SMILEs strings (unless `yield_smis = False`). - """ - - db = SubstructureDb(path_substructure_db, path_connectivity_db) - results_db = ResultsDb(path_out, False) - results_db.create_results_db() - - if path_connectivity_db is None: - path_connectivity_db = os.path.join(os.path.realpath(os.path.dirname(__file__)), "data", "connectivity.sqlite") - - # prepare temporary table here - will only be generated once in case of multiple input - table_name = gen_subs_table( - db=db, - ha_min=ha_min, - ha_max=ha_max, - max_degree=max_degree, - max_atoms_available=max_atoms_available, - minimum_frequency=minimum_frequency, - max_mass=round(max([ms_data[ms_id]["exact_mass"] for ms_id in ms_data.keys()])) - ) - - for i, ms in enumerate(parse_ms_data(ms_data, False)): - - results_db.add_ms(ms_data, ms["ms_id"], i, - [None, ha_min, ha_max, max_atoms_available, max_degree, max_n_substructures, None, isomeric_smiles]) - - ppm = None - - try: - if ms["prescribed_mass"] is not None: - ppm = 0 - except KeyError: - ms["prescribed_mass"] = None - - smi_dict = build( - mf=ms["mf"], - exact_mass=ms["exact_mass"], - max_n_substructures=max_n_substructures, - path_connectivity_db=path_connectivity_db, - 
path_substructure_db=path_substructure_db, - prescribed_mass=ms["prescribed_mass"], - ppm=ppm, - table_name=table_name, - ncpus=ncpus, - clean=False, - isomeric_smiles=isomeric_smiles, - retain_substructures=retain_substructures - ) - - results_db.add_results(i, smi_dict, ms["prescribed_mass"]) - smi_dict = None - - results_db.calculate_frequencies(i) - - if yield_smis: - yield {ms["ms_id"]: results_db.get_structures(i)} - - if write_csv_output: - results_db.generate_csv_output() - - db.close() - - -def build(mf, exact_mass, max_n_substructures, path_connectivity_db, path_substructure_db, - prescribed_mass, ppm, ncpus, table_name, clean, isomeric_smiles, retain_substructures): - """ - Core function for generating molecules of a given mass using substructures and connectivity graphs. Can optionally - take a "prescribed" fragment mass to further filter results; this can be used to incorporate MSn data. Final - molecules are written to the specified file and/or returned in smiles format. This function represents the central - building method used by MetaboBlend; :py:meth:`metaboblend.build_structures.generate_structures` provides a simple - interface for generating structures of a given mass whilst - :py:meth:`metaboblend.build_structures.annotate_msn` allows for the generation and scoring of structures using - information from fragmentation spectra. - - :param mf: List of integers detailing the molecular formula of the target metabolite, in the format - [C, H, N, O, P, S]. - - :param exact_mass: The exact mass (float) of the target metabolite. - - :param max_n_substructures: The maximum number of substructures to be used for building molecules. - - :param path_substructure_db: The path to the SQLite 3 substructure database, as generated by - :py:meth:`metaboblend.databases.SubstructureDb`. - - :param path_connectivity_db: The path to the SQLite 3 connectivity database, as generated by - :py:meth:`metaboblend.databases.create_isomorphism_database`. 
- - :param prescribed_mass: A mass by which to filter results; if not provided, all possible structures will be - generated. - - :param ppm: The maximal tolerated m/z deviation (in parts per million) of the mass of substructures from the - supplied `fragment_masses`. - - :param ncpus: How many CPUs to utilise; if left as None, :py:meth:`os.cpu_count` is used. - - :param table_name: The table specified within the substructure database will be used to generate - molecules. Will be removed after structures have been built, unless 'clean = False' is set. - - :param clean: Whether to remove the temporary table of substructures, table_name`, after the method is complete. - - :param isomeric_smiles: If True, writes smiles with non-structural isomeric information. - - :param retain_substructures: Whether to record the substructures used to generate final structures. - - :return: Returns a set of unique SMILEs strings. - """ - - db = SubstructureDb(path_substructure_db, path_connectivity_db) - tolerance = 0.001 - - if prescribed_mass is None: - exact_mass__1 = round(exact_mass) - - else: # prescribed substructure build method - if ((prescribed_mass / 1000000) * ppm) > 0.001: - fragment_tolerance = round((prescribed_mass / 1000000) * ppm, 4) - else: - fragment_tolerance = 0.001 - - prescribed_subset = db.select_mass_values("0_0001", [round(prescribed_mass)], table_name) - prescribed_subset = [m for m in prescribed_subset[0] if abs(m - prescribed_mass) <= fragment_tolerance] - - if len(prescribed_subset) == 0: - return {} - - loss = exact_mass - prescribed_mass - exact_mass__1 = round(loss) - - if ((exact_mass / 1000000) * ppm) > 0.001: - tolerance = round((exact_mass / 1000000) * ppm, 4) - - max_n_substructures -= 1 # we find sets of mols that add up to the loss, not the precursor mass - - if os.name == "nt": # multiprocessing freeze support on windows - multiprocessing.freeze_support() - - # select groups of masses at low mass resolution - integer_mass_values = [m for m in 
db.select_mass_values("1", [], table_name) if m <= exact_mass__1] - if len(integer_mass_values) == 0: - return {} - - integer_subsets = list(subset_sum(integer_mass_values, exact_mass__1, max_n_substructures)) - - configs_iso = db.k_configs() - - substructure_subsets = [] - for integer_subset in integer_subsets: - if len(integer_subset) > max_n_substructures or len(integer_subset) == 0: - continue - - # refine groups of masses to 4dp mass resolution - exact_mass_values = db.select_mass_values("0_0001", integer_subset, table_name) - - if prescribed_mass is not None: - exact_mass_values = [prescribed_subset] + exact_mass_values - - # use combinations to get second group of masses instead of subset sum - subset sum is integer mass only - exact_subsets = [] - for mass_combo in itertools.product(*exact_mass_values): - if abs(sum(mass_combo) - exact_mass) <= tolerance: - exact_subsets.append(mass_combo) - - if len(exact_subsets) == 0: - continue - - # refines groups based on ecs and gets substructures from db (appends to substructure_subsets) - for exact_subset in exact_subsets: - substructure_subsets += build_from_subsets(exact_subset, mf=mf, table_name=table_name, db=db) - - with multiprocessing.Pool(processes=ncpus) as pool: # send sets of substructures for building - smi_dicts = pool.map( - partial(substructure_combination_build, configs_iso=configs_iso, - prescribed_structure=prescribed_mass, isomeric_smiles=isomeric_smiles, - bond_enthalpies=get_bond_enthalpies(), retain_substructures=retain_substructures), - substructure_subsets - ) - - smi_dict = {} - for d in smi_dicts: - for k in d.keys(): - try: - smi_dict[k]["bdes"] += d[k]["bdes"] - - if retain_substructures: - smi_dict[k]["substructures"] += d[k]["substructures"] - - except KeyError: - smi_dict[k] = d[k] - - db.close(clean) - - return smi_dict - - -def gen_subs_table(db, ha_min, ha_max, max_degree, max_atoms_available, max_mass, table_name="subset_substructures", - minimum_frequency=None): - """ - Generate 
a temporary secondary substructure table restricted by a set of parameters. Generated as an initial step - in :py:meth:`metaboblend.build_structures.generate_structures` and - :py:meth:`metaboblend.build_structures.annotate_msn` in order to limit the processing overhead as a result of - repeatedly querying the SQLite substructure database. - - :param max_mass: The maximum allowed mass of substructures in the temporary table; there is no point considering - substructures with greater mass than the target mol. - - :param db: Connection to a :py:meth:`metaboblend.databases.SubstructureDb` from which to extract substructures. - - :param ha_min: Minimum value of `heavy_atoms` for substructures to be transferred into the temporary table. - If None, no limit is applied. - - :param ha_max: Maximum value of `heavy_atoms` for substructures to be transferred into the temporary table. - If None, no limit is applied. - - :param max_degree: The maximum total valence (ie, the product of `atoms_available` and the degree of their bonds) - to be included in the temporary table. - - :param max_atoms_available: The maximal atoms available of substructures to be included in the temporary table. - - :param table_name: Defaults to "subset_substructures", which is cleaned up upon database closure. The name of the - table to be generated - - :param minimum_frequency: The minimum frequency of substructures in table_name; e.g. substructures have a frequency - of 1 if they are unique. - - :return: The name of the temporary secondary substructure table. 
- """ - - db.cursor.execute("DROP TABLE IF EXISTS %s" % table_name) - - if minimum_frequency is None: - freq_statement = "" - else: - freq_statement = """ - AND smiles IN - (SELECT smiles - FROM hmdbid_substructures - GROUP BY smiles - HAVING COUNT(*) >= {}) - """.format(minimum_frequency,) - - if ha_min is None: - ha_min_statement = "" - else: - ha_min_statement = """ - AND heavy_atoms >= %s""" % str(ha_min) - - if ha_max is None: - ha_max_statement = "" - else: - ha_max_statement = """ - AND heavy_atoms <= %s""" % str(ha_max) - - if max_mass is None: - max_mass_statment = "" - else: - max_mass_statment = """ - AND exact_mass__1 < %s""" % str(max_mass) - - db.cursor.execute("""CREATE TABLE {} AS - SELECT * - FROM substructures - WHERE atoms_available <= {} - AND valence <= {}{}{}{}{} - """.format(table_name, - max_atoms_available, - max_degree, - max_mass_statment, - freq_statement, - ha_min_statement, - ha_max_statement)) - - db.create_indexes(table=table_name, selection="gen_subs_table") - - return table_name - - -def build_from_subsets(exact_subset, mf, table_name, db): - """ - A stage of the :py:meth:`metaboblend.build_structures.build` workflow for generating molecules to a given mass - from substructures. At this stage, mass subsets have been identified in the substructure database. Each of these - groups are now filtered further by identifying masses that refer to valid subsets of molecules, before they are - built to generate new molecules. - - :param db: The substructure and connectivity database. Elemental compositions and substructures are retrieved from - the database; this information is listed as "substructure_subset" and will be appended to the - substructure_subsets list provided as a parameter. - - :param exact_subset: Group of masses that sum to the correct total mass, refer to substructures in the substructure - database. 
- - :param mf: List of integers detailing the molecular composition of the target metabolite, in the format - `[C, H, N, O, P, S]`. - - :param table_name: The name of the table within the substructure database from which to extract substructures. A - prefiltered table based on the parameters specified in :py:meth:`metaboblend.build_structures.build`. See - :py:meth:`metaboblend.build_structures.gen_subs_table`. - """ - - substructure_subsets = [] - - mf_subset = combine_mfs(exact_subset, db, table_name, "0_0001") - - if len(mf_subset) == 0: - return [] - - for ec_product in itertools.product(*mf_subset): - - if mf != list(numpy.array(ec_product).sum(axis=0)): - continue # check each set of elemental compositions matches the target mol - - substructure_subset = db.select_substructures(ec_product, table_name) - - if len(substructure_subset) == 0: - continue - - substructure_subsets.append(substructure_subset) - - return substructure_subsets - - -def get_bond_enthalpies(): - - return {1.0: {'C': {'C': 348, 'N': 305, 'O': 360, 'P': 264, 'S': 272}, - 'N': {'C': 305, 'N': 163, 'O': 222, 'P': None, 'S': None}, - 'O': {'C': 360, 'N': 222, 'O': 146, 'P': 335, 'S': None}, - 'P': {'C': 264, 'N': None, 'O': 335, 'P': 201, 'S': None}, - 'S': {'C': 272, 'N': None, 'O': None, 'P': None, 'S': 226}}, - 1.5: {'C': {'C': 837, 'N': 890, 'O': None, 'P': None, 'S': None}, - 'N': {'C': 890, 'N': 944, 'O': None, 'P': None, 'S': None}, - 'O': {'C': None, 'N': None, 'O': None, 'P': None, 'S': None}, - 'P': {'C': None, 'N': None, 'O': None, 'P': None, 'S': None}, - 'S': {'C': None, 'N': None, 'O': None, 'P': None, 'S': None}}, - 2.0: {'C': {'C': 612, 'N': 613, 'O': 743, 'P': None, 'S': 573}, - 'N': {'C': 613, 'N': 409, 'O': 607, 'P': None, 'S': None}, - 'O': {'C': 743, 'N': 607, 'O': 496, 'P': 544, 'S': 522}, - 'P': {'C': None, 'N': None, 'O': 544, 'P': None, 'S': 335}, - 'S': {'C': 573, 'N': None, 'O': 522, 'P': 335, 'S': 425}}} - - -def substructure_combination_build(substructure_subset, 
configs_iso, prescribed_structure, isomeric_smiles, - bond_enthalpies, retain_substructures): - """ - Final stage for building molecules; takes a combination of substructures (substructure_combination) and builds them - according to graphs in the substructure database. May be run in parallel. - - :param substructure_subset: Combinations of substructures for building mols. - - :param configs_iso: Possible substructure combinations extracted from the connectivity database. A tuple containing - tuples for each substructure; these tuples specify how many bonds each substructure can make. - - :param prescribed_structure: Prescribed fragment mass for building. - - :param isomeric_smiles: True/False, should output smiles be written with isomeric information? - - :param bond_enthalpies: Dictionary of bond enthalpies, as generated by - :py:meth:`metaboblend.build_structures.get_bond_enthalpies`. - - :param retain_substructures: Whether to record the substructures used to generate final structures. - - :return: List of smiles representing molecules generated (and the substructures used to generate them). 
- """ - - smis = {} - - for substructure_combination in itertools.product(*substructure_subset): - substructure_combination[0]["fragment"] = True - substructure_combination = sorted(substructure_combination, key=itemgetter('atoms_available', 'valence')) - - v_a = () - if prescribed_structure is not None: - fragment_indexes = [] - j = -1 - for i, d in enumerate(substructure_combination): - v_a += (tuple(d["degree_atoms"].values()),) # obtain valence configuration of the set of substructures - - for atom_available in tuple(d["degree_atoms"].values()): - j += 1 - - try: - if prescribed_structure is not None and d["fragment"]: - fragment_indexes.append(j) - except KeyError: - continue - - if str(v_a) not in configs_iso: # check mols "fit" together according to the connectivity database - continue - - mol_comb, atoms_available, atoms_to_remove, bond_types, bond_mismatch = reindex_atoms(substructure_combination) - - if bond_mismatch: - continue # check that bond types are compatible (imperfect check) - - for edges in configs_iso[str(v_a)]: # build mols for each graph in connectivity db - if prescribed_structure is not None: - non_fragment_edges = False - - for edge in edges: # check that edges only connect to fragment ion - if edge[0] not in fragment_indexes and edge[1] not in fragment_indexes: - non_fragment_edges = True - - if non_fragment_edges: - continue - - mol_e, total_bde = add_bonds(mol_comb, edges, atoms_available, bond_types, bond_enthalpies) # add bonds between substructures - - if mol_e is None or total_bde is None: - continue - - atoms_to_remove.sort(reverse=True) - [mol_e.RemoveAtom(a) for a in atoms_to_remove] # clean up dummy atoms - - mol_out = mol_e.GetMol() # generate the final (non-editable) mol - - try: - Chem.SanitizeMol(mol_out) # clean the mol - ensure it is valid & canonical - except: - continue - - try: # append the canonical smiles of the final structure - final_structure = Chem.MolToSmiles(mol_out, isomericSmiles=isomeric_smiles) - except 
RuntimeError: - continue # bad bond type violation - - final_substructures = [subs["smiles"] for subs in substructure_combination] - - try: - smis[final_structure]["bdes"].append(total_bde) - - if retain_substructures: - smis[final_structure]["substructures"].append(final_substructures) - - except KeyError: - smis[final_structure] = {"bdes": [total_bde]} - - if retain_substructures: - smis[final_structure]["substructures"] = [final_substructures] - - return smis diff --git a/metaboblend/build_structures/__init__.py b/metaboblend/build_structures/__init__.py new file mode 100644 index 0000000..8a2d78a --- /dev/null +++ b/metaboblend/build_structures/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# diff --git a/metaboblend/build_structures/annotate.py b/metaboblend/build_structures/annotate.py new file mode 100644 index 0000000..ee1fd4a --- /dev/null +++ b/metaboblend/build_structures/annotate.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. 
+# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# + +import os +from typing import Sequence, Dict, Union + +from metaboblend.parse import parse_ms_data +from metaboblend.databases.results import ResultsDb +from metaboblend.databases.substructures import SubstructureDb +from metaboblend.build_structures.build import build, gen_subs_table, get_possible_fragment_ions + + +def annotate_msn(msn_data: Union[str, os.PathLike, Dict[str, Dict[str, Union[int, list]]]], + path_substructure_db: Union[str, bytes, os.PathLike] = os.path.realpath(os.getcwd()), + path_out: Union[str, bytes, os.PathLike] = "", + ppm: int = 5, + ha_min: Union[int, None] = None, + ha_max: Union[int, None] = None, + max_atoms_available: int = 2, + max_degree: int = 4, + max_n_substructures: int = 3, + max_bde: Union[float, None] = 2000, + path_connectivity_db: Union[str, bytes, os.PathLike, None] = None, + ncpus: Union[int, None] = None, + minimum_frequency: Union[int, None] = None, + hydrogenation_allowance: int = 2, + yield_smis: bool = True, + isomeric_smiles: bool = False, + write_csv_output: bool = True, + retain_substructures: bool = False, + abs_error_peak: float = 0.001, + abs_error_precursor: float = 0.0005, + ) -> Dict[str, Sequence[Dict[str, int]]]: + """ + Generate molecules of a given mass using chemical substructures, connectivity graphs and spectral trees or + fragmentation spectra. 
Final structures and rankings are yielded by the function as a dictionary and/or written in + text format. For the generation of structures without MSn data, see + :py:meth:`metaboblend.build_structures.generate_structures`. + + :param msn_data: Either a dictionary or the path to an MSP file. MSP files are parsed by + :py:meth:`metaboblend.parse.parse_ms_data` before being converted into a dictionary. If a dictionary is + provided, it must contain one item per fragmentation spectrum; the keys of the dictionary should be a unique ID + for the query and the corresponding value must itself be a dictionary, containing the following: + + - "exact_mass": `float` (neutral mass of query) OR "precursor_mz": `float` (mz of precursor ion) + - "mf": `[C, H, N, O, P, S]` (a list of 6 integers) + - "neutral_fragment_masses": `[float, float, ...]` (list of neutral fragment masses) OR "fragment_mzs": + `[float, float, ...]` (list of fragment mzs) + - "precursor_type": `str` (e.g. "[M+H]+", required for calculating neutral masses from ion mzs) + + The dictionary or MSP path is fed to :py:meth:`metaboblend.parse.parse_ms_data`. + + :param path_substructure_db: The path to the SQLite 3 substructure database, as generated by + :py:meth:`metaboblend.databases.SubstructureDb`. + + :param path_out: Folder to which the SQLite 3 results database and CSV outputs should be written. + + :param ppm: The maximal tolerated m/z deviation (in parts per million) of the mass of substructures from the + supplied `fragment_masses`. + + :param ha_min: The minimum size (number of heavy atoms) of substructures to be used to build final structures. If + None, no limit is applied. + + :param ha_max: The maximum size (number of heavy atoms) of substructures to be used to build final structures. If + None, no limit is applied. + + :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for + building molecules. 
`atoms_available` refers to the number of atoms on a substructure involved in forming + chemical bonds (e.g. single or double bonds). `atoms_available` are also limited by the extensivity of the + supplied connectivity database. + + :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We + define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, + 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For + instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have + a total degree of 3. + + :param max_n_substructures: The maximum number of substructures to be used for building molecules. The max number + of substructures is also limited by the extensivity of the supplied connectivity database. + + :param path_connectivity_db: The path to the SQLite 3 connectivity database, as generated by + :py:meth:`metaboblend.databases.create_isomorphism_database`. If the path is None, the default connectivity + database bundled with MetaboBlend will be used. + + :param ncpus: How many CPUs to utilise; if left as None, :py:meth:`os.cpu_count` is used. + + :param minimum_frequency: The minimum frequency of substructures in table_name; e.g. substructures have a frequency + of 1 if they are unique. Defaults to None, in which case this filtering method is not applied. + + :param hydrogenation_allowance: In order to represent re-arrangement events (the movement of hydrogens), in + addition to attempting to build from substructures in prescribed_masses, we also attempt to build from + `fragment_masses +- hydrogenation_allowance`. E.g. if `prescribed_masses = [141.5938]` and + `hydrogenation_allowance = 1` then, to find candidate fragment substructures, we use as query the masses + `[141.5938 - 1.007825, 141.5938, 141.5938 + 1.007825]`. 
+ + :param yield_smis: If True, for each input molecule the function yields SMILEs and the number of `fragment_masses` by + which the structure was generated. Else, returns None. + + :param isomeric_smiles: If True, writes smiles with non-structural isomeric information. + + :param write_csv_output: Whether to extract results from the SQLite3 database for deposition in CSV files. + + :param retain_substructures: Whether to record the substructures used to generate final structures. + + :param abs_error_peak: Allowable absolute mz deviation from MSn peaks. + + :param abs_error_precursor: Allowable absolute mz deviation of final structure from precursor mass. + + :return: For each input molecule yields a dictionary whose keys are SMILEs strings for the generated + structures and values are the number of `fragment_masses` by which the structure was built (unless + `yield_smis = False`). + """ + + if ppm is None: + ppm = 0 + + if path_connectivity_db is None: + path_connectivity_db = os.path.join(os.path.realpath(os.path.dirname(__file__)), "../data", + "connectivity.sqlite") + + db = SubstructureDb(path_substructure_db, path_connectivity_db) + results_db = ResultsDb(path_out, retain_substructures=retain_substructures) + results_db.create_results_db() + + # prepare temporary table here - will only be generated once in case of multiple input + table_name = gen_subs_table( + db=db, + ha_min=ha_min, + ha_max=ha_max, + max_degree=max_degree, + max_atoms_available=max_atoms_available, + minimum_frequency=minimum_frequency, + max_mass=None + ) + + for i, ms in enumerate(parse_ms_data(msn_data)): + + if ms is None: + continue + + results_db.add_ms(msn_data, ms["ms_id"], i, [ppm, ha_min, ha_max, max_atoms_available, max_degree, + max_n_substructures, hydrogenation_allowance, isomeric_smiles]) + + for j, fragment_mass in enumerate(ms["neutral_fragment_masses"]): + + # start off by getting the substructures that could represent the fragment ion + possible_fragment_ions =
get_possible_fragment_ions(fragment_mass, db, hydrogenation_allowance, ppm, + abs_error_peak, table_name) + + smi_dict = build( + db=db, + mf=ms["mf"], + exact_mass=ms["exact_mass"], + max_n_substructures=max_n_substructures, + prescribed_substructures=possible_fragment_ions, + ppm=ppm, + table_name=table_name, + ncpus=ncpus, + isomeric_smiles=isomeric_smiles, + tolerance=abs_error_precursor, + max_bde=max_bde + ) + + results_db.add_results(i, smi_dict, fragment_mass, j) + smi_dict = None + + results_db.calculate_scores(i) + + if yield_smis: + yield {ms["ms_id"]: results_db.get_structures(i)} + + if write_csv_output: + results_db.generate_csv_output() + + db.close() + results_db.close() + + +def generate_structures(ms_data: Union[str, os.PathLike, Dict[str, Dict[str, Union[int, None]]]], + path_substructure_db: Union[str, bytes, os.PathLike], + path_out: Union[str, bytes, os.PathLike] = os.path.realpath(os.getcwd()), + ha_min: Union[int, None] = 2, + ha_max: Union[int, None] = 9, + max_degree: int = 6, + max_atoms_available: int = 2, + max_n_substructures: int = 3, + max_bde: Union[float, None] = None, + ncpus: Union[int, None] = None, + path_connectivity_db: Union[str, bytes, os.PathLike, None] = None, + minimum_frequency: Union[int, None] = None, + yield_smis: bool = True, + isomeric_smiles: bool = False, + write_csv_output: bool = True, + retain_substructures: bool = False + ) -> Dict[str, Sequence[set]]: + """ + Generate molecules of a given mass using chemical substructures and connectivity graphs. Can optionally take a + "prescribed" fragment mass to further filter results. Final structures are returned as a list and/or written in + text format. For the generation of structures from MSn data, see + :py:meth:`metaboblend.build_structures.annotate_msn`. 
+ + :param ms_data: A dictionary that must contain one item per fragmentation spectrum; the keys of the dictionary + should be a unique ID for the query and the corresponding value must itself be a dictionary, containing the + following: + + - "exact_mass": `float` (neutral mass of query) OR "precursor_mz": `float` (mz of precursor ion) + - "mf": `[C, H, N, O, P, S]` (a list of 6 integers) + - "precursor_type": `str` (e.g. "[M+H]+", required for calculating neutral masses from ion mzs) + - (optional) "prescribed_mass": 'float' (neutral mass of substructure). + + The dictionary or MSP path is fed to :py:meth:`metaboblend.parse.parse_ms_data`. A single neutral substructure + mass may be provided ("prescribed_mass") to guide the structure generation process. + + :param path_substructure_db: The path to the SQLite 3 substructure database, as generated by + :py:meth:`metaboblend.databases.SubstructureDb`. + + :param path_out: Folder to which the SQLite 3 results database and CSV outputs should be written. + + :param ha_min: The minimum size (number of heavy atoms) of substructures to be used to build final structures. If + None, no limit is applied. + + :param ha_max: The maximum size (number of heavy atoms) of substructures to be used to build final structures. If + None, no limit is applied. + + :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We + define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, + 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For + instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have + a total degree of 3. + + :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for + building molecules. 
`atoms_available` refers to the number of atoms on a substructure involved in forming + chemical bonds (e.g. single or double bonds). `atoms_available` are also limited by the extensivity of the + supplied connectivity database. + + :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We + define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, + 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For + instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have + a total degree of 3. + + :param max_n_substructures: The maximum number of substructures to be used for building molecules. The max number + of substructures is also limited by the extensivity of the supplied connectivity database. + + :param path_connectivity_db: The path to the SQLite 3 connectivity database, as generated by + :py:meth:`metaboblend.databases.create_isomorphism_database`. If the path is None, the default connectivity + database bundled with MetaboBlend will be used. + + :param ncpus: How many worker processes to utilise; if left as None, :py:meth:`os.cpu_count` is used. + + :param minimum_frequency: The minimum frequency of substructures in table_name; e.g. substructures have a frequency + of 1 if they are unique. Defaults to None, in which case this filtering method is not applied. + + :param yield_smis: If True, yields a set of unique SMILEs string for each input molecule, else returns None. + + :param isomeric_smiles: If True, writes smiles with non-structural isomeric information. + + :param write_csv_output: Whether to extract results from the SQLite3 database for deposition in CSV files. + + :param retain_substructures: Whether to record the substructures used to generate final structures. 
+ + :return: For each input molecule, yields unique SMILEs strings (unless `yield_smis = False`). + """ + + if path_connectivity_db is None: + path_connectivity_db = os.path.join(os.path.realpath(os.path.dirname(__file__)), "../data", + "connectivity.sqlite") + + db = SubstructureDb(path_substructure_db, path_connectivity_db) + + results_db = ResultsDb(path_out, False, retain_substructures=retain_substructures) + results_db.create_results_db() + + # prepare temporary table here - will only be generated once in case of multiple input + table_name = gen_subs_table( + db=db, + ha_min=ha_min, + ha_max=ha_max, + max_degree=max_degree, + max_atoms_available=max_atoms_available, + minimum_frequency=minimum_frequency, + max_mass=round(max([ms_data[ms_id]["exact_mass"] for ms_id in ms_data.keys()])) + ) + + for i, ms in enumerate(parse_ms_data(ms_data, False)): + + results_db.add_ms(ms_data, ms["ms_id"], i, + [None, ha_min, ha_max, max_atoms_available, max_degree, max_n_substructures, None, + isomeric_smiles]) + + ppm = None + prescribed_substructures = None + + try: + if ms["prescribed_mass"] is not None: + ppm = 0 + prescribed_substructures = get_possible_fragment_ions(ms["prescribed_mass"], db, table_name=table_name) + + except KeyError: + ms["prescribed_mass"] = None + + smi_dict = build( + mf=ms["mf"], + exact_mass=ms["exact_mass"], + max_n_substructures=max_n_substructures, + prescribed_substructures=prescribed_substructures, + ppm=ppm, + table_name=table_name, + ncpus=ncpus, + isomeric_smiles=isomeric_smiles, + db=db, + tolerance=0.0001, + max_bde=max_bde + ) + + results_db.add_results(i, smi_dict, ms["prescribed_mass"]) + smi_dict = None + + results_db.calculate_scores(i) + + if yield_smis: + yield {ms["ms_id"]: results_db.get_structures(i)} + + if write_csv_output: + results_db.generate_csv_output() + + db.close() diff --git a/metaboblend/build_structures/build.py b/metaboblend/build_structures/build.py new file mode 100644 index 0000000..8fb33da --- /dev/null +++ 
b/metaboblend/build_structures/build.py @@ -0,0 +1,901 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# + +import os +import copy +import numpy +import itertools +import multiprocessing +import networkx as nx +from functools import partial +from operator import itemgetter +from rdkit import Chem + +from metaboblend.algorithms import subset_sum + + +def combine_mfs(precise_mass_grp, db, table_name, accuracy): + """ + A wrapper for :py:meth:`metaboblend.databases.SubstructureDb.select_mfs` that instead takes a group of subsets, as generated by + the second stage of :py:meth:`metaboblend.algorithms.subset_sum` in + :py:meth:`metaboblend.build_structures.build`. + + :param precise_mass_grp: A list containing the masses of substructures identified by subset_sum. + + :param db: The :py:meth:`metaboblend.databases.SubstructureDb` in which to search for elemental compositions. + + :param table_name: The name of the table containing substructures in which to search for elemental compositions. + + :param accuracy: The number of decimal places of accuracy to which results are limited. + + * **1** Integer level + * **0_0001** Four decimal places + + :return: If there are no elemental compositions for any of the masses in the group, then an empty list is returned. Otherwise, a list containing the elemental compositions found for each mass is returned.
+ """ + + ecs = [] + + for i in range(len(precise_mass_grp)): + atoms = db.select_mfs(precise_mass_grp[i], table_name, accuracy) + + if len(atoms) == 0: + return [] + + ecs.append(atoms) + + return ecs + + +def reindex_atoms(records): + """ + Parses the libs of groups of substructures that are to be combined; the lib is a dictionary containing details + about the substructure, as generated by :py:meth:`metaboblend.databases.get_substructure`. Combines the + molecules into a single :py:meth:`rdkit.Chem.Mol` object and obtains details on their bonding properties. + + :param records: Takes a list of lib dictionaries that contain details on each substructure to be combined. + + :return: Returns a tuple containing a :py:meth:`rdkit.Chem.CombineMols` object, that stores all the substructures + as a single molecule, followed by information on the substructure bonding properties, including: + + * **atoms_available** A list of the indices of atoms that are available for bonding. + + * **atoms_to_remove** A list of the indices of dummy atoms that are to be removed in order to bond with other + substructures. + + * **bond_types** A dictionary containing the indices of atoms that are available for bonding as keys and values + detailing their bond types. See :py:meth:`metaboblend.build_structures.add_bonds`. 
+ """ + + atoms_available, atoms_to_remove, index_atoms = [], [], [] + bond_types, all_bond_types = {}, {} + c = 0 + + for i, record in enumerate(records): + + idxs = [] + all_bond_types[i] = [] + + for atom in record["mol"].GetAtoms(): + + atom_idx = atom.GetIdx() + + new_idx = atom_idx + c + idxs.append(new_idx) + + if atom_idx in record["degree_atoms"]: + atoms_available.append(new_idx) + + if atom_idx in record["dummies"]: + atoms_to_remove.append(new_idx) + + if atom_idx in record["bond_types"]: + bond_types[new_idx] = record["bond_types"][atom_idx] + all_bond_types[i] += record["bond_types"][atom_idx] + + index_atoms.append(idxs) + c = idxs[-1] + 1 + + # check that bond types add up - removes some mismatched configurations + for i in range(len(records)): + other_bonds = [] + for j in range(len(records)): + if i != j: + other_bonds += all_bond_types[j] + + for bond in all_bond_types[i]: + if bond not in other_bonds: + return None, atoms_available, atoms_to_remove, bond_types, True + + mol_comb = Chem.Mol() + for record in records: + mol_comb = Chem.CombineMols(mol_comb, record["mol"]) + + return mol_comb, atoms_available, atoms_to_remove, bond_types, False + + +def add_bonds(mols, edges, atoms_available, bond_types, bond_enthalpies): + """ + Takes a set of substructures and attempts to combine them together to generate a final structure. One of the last + steps in the :py:meth:`metaboblend.build_structures.build` workflow. + + :param mols: A :py:meth:`rdkit.Chem.CombineMols` object, that stores all the substructures + as a single molecule. + + :param edges: The edges to use in order to join the substructures together, obtained from the connectivity database + (:py:meth:`metaboblend.databases.create_isomorphism_database`). + + :param atoms_available: A list of the indices of atoms that are available for bonding. + + :param bond_types: The type of bonds to be formed by dummy atoms - see :py:meth:`Chem.rdchem.BondType`. 
Is a + dictionary whose keys are atom indices and values are bond types, as follows: + + * **1.0** Single + + * **1.5** Aromatic + + * **2.0** Double + + :param bond_enthalpies: Dictionary of bond enthalpies, as generated by + :py:meth:`metaboblend.build_structures.get_bond_enthalpies`. + + :return: If unsuccessful, returns None, else returns an :py:meth:`rdkit.Chem.EditableMol` object containing + the substructures combined into a final single molecule. + """ + + rdkit_bond_types = {1: Chem.rdchem.BondType.SINGLE, + 1.5: Chem.rdchem.BondType.AROMATIC, + 2: Chem.rdchem.BondType.DOUBLE} + + bond_types_copy = copy.deepcopy(bond_types) # deep copy as we modify items within the dict + + g = nx.Graph() + g.add_edges_from(edges) + + g = nx.relabel_nodes(g, dict(zip(sorted(g.nodes()), atoms_available))) + + total_bde = 0 + + mol_edit = Chem.EditableMol(mols) + for edge in g.edges(): + + if edge[0] in bond_types_copy: + bt_start = bond_types_copy[edge[0]] + else: + return None, None # nested dummy + + if edge[1] in bond_types_copy: + bt_end = bond_types_copy[edge[1]] + else: + return None, None # nested dummy + + bond_matches = list(set(bt_start).intersection(bt_end)) + + if len(bond_matches) == 0: + return None, None + + bt_start.remove(bond_matches[0]) + bt_end.remove(bond_matches[0]) + + try: # try forming the specified bond + mol_edit.AddBond(edge[0], edge[1], rdkit_bond_types[bond_matches[0]]) + except KeyError: + return None, None # unknown bond type + + # calculate bond dissociation energy of "formed" bonds for the structure + try: + total_bde += bond_enthalpies[bond_matches[0]][mols.GetAtomWithIdx(edge[0]).GetSymbol()][mols.GetAtomWithIdx(edge[1]).GetSymbol()] + except (SyntaxError, TypeError): + total_bde = None + + return mol_edit, total_bde + + +def get_possible_fragment_ions(neutral_fragment_mass, db, hydrogenation_allowance=None, ppm=None, tolerance=None, table_name=None): + """ + Get possible fragment ions from a neutral mass. 
Either matches the mass exactly or does approximate matching + (within the allowable absolute or relative tolerance). + + :param neutral_fragment_mass: The neutral mass of the MS fragment. + + :param hydrogenation_allowance: Searches for substructures within `+-hydrogenation_allowance` hydrogen masses for + substructures. Substructures that are matched by the neutral fragment mass itself are considered "even" whereas + those matched by the modified masses are considered "odd" (i.e. non-standard hydrogen re-arrangements). In the + case of exact find, this field is left as None. + + :param db: Connection to the SQLite3 substructure database, :py:meth:`metaboblend.databases.SubstructureDb`. + + :param ppm: The relative (parts per million) tolerance for the neutral fragment mass. In the case of exact find, + this field is ignored. + + :param tolerance: The minimum absolute mz tolerance for the neutral fragment mass. In the case of exact find, this + field is ignored. + + :param table_name: Name of the substructures and substructure_ions tables to query. If None, uses the main tables. + + :return: A dictionary in the format: + {integer_mass (int): {exact_mass (float): {substructure_id (int): substructure (dict)}}}. 
+ """ + + if table_name is None: + subs_table_name = "substructures" + subs_ions_table_name = "substructure_ions" + else: + subs_table_name = table_name + "_substructures" + subs_ions_table_name = table_name + "_substructure_ions" + + fragments = {} + + if hydrogenation_allowance is None: # exact find + + exact_mass__1 = round(neutral_fragment_mass, 0) + exact_mass_0_0001 = round(neutral_fragment_mass, 4) + + fragments[exact_mass__1] = {exact_mass_0_0001: {}} + + db.cursor.execute("""SELECT smiles, mol, bond_types, valence_atoms, valence, atoms_available, dummies, + C, H, N, O, P, S, substructure_id + FROM {} + WHERE exact_mass__0_0001 = {} + """.format(subs_table_name, exact_mass_0_0001)) + + for record in db.cursor.fetchall(): + + substructure = { + "smiles": record[0], + "mol": Chem.Mol(record[1]), + "bond_types": eval(record[2]), + "degree_atoms": eval(record[3]), + "valence": record[4], + "atoms_available": record[5], + "dummies": eval(record[6]), + "even": True, + "mf": (record[7], record[8], record[9], record[10], record[11], record[12],) + } + + fragments[exact_mass__1][exact_mass_0_0001][record[13]] = substructure + + else: # approximate find +- hydrogens + + for i in range(0 - hydrogenation_allowance, hydrogenation_allowance + 1): + + hydrogenated_fragment_mass = neutral_fragment_mass + (i * 1.007825) # consider non-standard re-arrangements + + if ((hydrogenated_fragment_mass / 1000000) * ppm) > tolerance: + fragment_tolerance = round((hydrogenated_fragment_mass / 1000000) * ppm, 4) + else: + fragment_tolerance = tolerance + + db.cursor.execute("""SELECT smiles, mol, bond_types, valence_atoms, valence, atoms_available, dummies, + exact_mass__0_0001, exact_mass__1, C, H, N, O, P, S, unmodified_substructures.substructure_id, + modified_masses.modified_exact_mass__0_0001 + FROM {} AS unmodified_substructures + LEFT JOIN {} AS modified_masses ON unmodified_substructures.substructure_id = modified_masses.substructure_id + WHERE 
modified_masses.modified_exact_mass__0_0001 > {} AND modified_masses.modified_exact_mass__0_0001 < {} + """.format( + subs_table_name, + subs_ions_table_name, + hydrogenated_fragment_mass - fragment_tolerance, + hydrogenated_fragment_mass + fragment_tolerance + )) + + for record in db.cursor.fetchall(): + + substructure = { + "smiles": record[0], + "mol": Chem.Mol(record[1]), + "bond_types": eval(record[2]), + "degree_atoms": eval(record[3]), + "valence": record[4], + "atoms_available": record[5], + "dummies": eval(record[6]), + "even": i == 0, + "mf": (record[9], record[10], record[11], record[12], record[13], record[14],), + "ppm_error": abs(((hydrogenated_fragment_mass - record[16]) / hydrogenated_fragment_mass) * 1000000) + } + + if record[8] not in fragments.keys(): + fragments[record[8]] = {record[7]: {record[15]: substructure}} + + elif record[7] not in fragments[record[8]].keys(): + fragments[record[8]][record[7]] = {record[15]: substructure} + + # if hydrogenation modifications have been made (i.e. i != 0) do not modify existing record + elif record[15] not in fragments[record[8]][record[7]].keys() or i == 0: + fragments[record[8]][record[7]][record[15]] = substructure + + return fragments + + +def build(db, mf, exact_mass, max_n_substructures, prescribed_substructures, ppm, ncpus, table_name, isomeric_smiles, + tolerance, max_bde): + """ + Core function for generating molecules of a given mass using substructures and connectivity graphs. Can optionally + take a "prescribed" fragment mass to further filter results; this can be used to incorporate MSn data. Final + molecules are written to the specified file and/or returned in smiles format. 
This function represents the central + building method used by MetaboBlend; :py:meth:`metaboblend.build_structures.generate_structures` provides a simple + interface for generating structures of a given mass whilst + :py:meth:`metaboblend.build_structures.annotate_msn` allows for the generation and scoring of structures using + information from fragmentation spectra. + + :param db: Connection to the SQLite3 substructure database, :py:meth:`metaboblend.databases.SubstructureDb`. + + :param mf: List of integers detailing the molecular formula of the target metabolite, in the format + [C, H, N, O, P, S]. + + :param exact_mass: The exact mass (float) of the target metabolite. + + :param max_n_substructures: The maximum number of substructures to be used for building molecules. + + :param prescribed_substructures: Substructures by which to filter results, as generated by + :py:meth:`metaboblend.build_structures.get_possible_fragment_ions`. + + :param ppm: The maximal tolerated m/z deviation (in parts per million) of the mass of substructures from the + supplied `fragment_masses`. + + :param ncpus: How many CPUs to utilise; if left as None, :py:meth:`os.cpu_count` is used. + + :param table_name: The table specified within the substructure database will be used to generate + molecules. Will be removed after structures have been built, unless 'clean = False' is set. + + :param isomeric_smiles: If True, writes smiles with non-structural isomeric information. + + :param tolerance: Minimum absolute mz tolerance for the fragment and precursor masses. Only used if + `prescribed_substructures` is not None. + + :param max_bde: Maximum total bond dissociation energy of the bonds formed between substructures; candidate + structures exceeding this value are discarded. If None, no limit is applied. + + :return: Returns a dictionary keyed by the unique SMILES strings of the generated structures. Each value is a + dictionary of lists describing how the structure was built ("bde", "valence" and "substructures", plus "even" + and, where available, "ppm_error" when `prescribed_substructures` is not None).
+ """ + + configs_iso = db.k_configs() + + # select groups of masses at low mass resolution + integer_mass_values = [m for m in db.select_mass_values("1", [], table_name) if m <= round(exact_mass, 0)] + + if os.name == "nt": # multiprocessing freeze support on windows + multiprocessing.freeze_support() + + substructure_subsets = [] + + if prescribed_substructures is None: # standard build method, does not require knowledge of a substructure of the target metabolite + substructure_subsets = refine_masses_standard(substructure_subsets, mf, exact_mass, integer_mass_values, max_n_substructures, table_name, db) + + else: # MSn build method - requires a list of possible substructures (prescribed_substructures) + + substructure_subsets = refine_masses_prescribed(substructure_subsets, mf, exact_mass, prescribed_substructures, ppm, integer_mass_values, max_n_substructures - 1, table_name, db, tolerance) + + with multiprocessing.Pool(processes=ncpus) as pool: # send sets of substructures for building + smi_dicts = pool.map( + partial(substructure_combination_build, configs_iso=configs_iso, + prescribed_method=prescribed_substructures is not None, isomeric_smiles=isomeric_smiles, + bond_enthalpies=get_bond_enthalpies(), max_bde=max_bde), + substructure_subsets + ) + + substructure_subsets = None + + # recombine the output of pool.map into a single dictionary + smi_dict = {} + for d in smi_dicts: + for k in d.keys(): + try: + smi_dict[k]["bde"] += d[k]["bde"] + smi_dict[k]["valence"] += d[k]["valence"] + smi_dict[k]["substructures"] += d[k]["substructures"] + + if prescribed_substructures is not None: + smi_dict[k]["even"] += d[k]["even"] + smi_dict[k]["ppm_error"] += d[k]["ppm_error"] + + except KeyError: + smi_dict[k] = d[k] + + return smi_dict + + +def refine_masses_standard(substructure_subsets, mf, exact_mass, integer_mass_values, max_n_substructures, table_name, + db): + """ + Takes a set of masses and applies :py:meth:`metaboblend.algorithms.subset_sum`. 
Generates a list of subsets of + substructures to be combined into candidate target metabolites. + + :param substructure_subsets: List of substructure subsets to be filled, usually empty. + + :param integer_mass_values: List of possible integer masses for all valid substructures. + + :param max_n_substructures: The maximum number of substructures to be combined. + + :param table_name: The name of the table from which to extract substructures - if None, searches the main tables. + + :param db: Connection to the SQLite3 substructure database, :py:meth:`metaboblend.databases.SubstructureDb`. + + :param mf: List of integers detailing the molecular formula of the target metabolite, in the format + [C, H, N, O, P, S]. + + :param exact_mass: The exact mass (float) of the target metabolite. + + :param max_n_substructures: The maximum number of substructures to be used for building molecules. + + :return: Returns a list of lists - one list for each substructure subset. Each substructure subset list contains + a list for each + """ + + integer_subsets = list(subset_sum(integer_mass_values, int(round(exact_mass, 0)), max_n_substructures)) + + for integer_subset in integer_subsets: + + if len(integer_subset) > max_n_substructures or len(integer_subset) == 0: + continue + + # refine groups of masses to 4dp mass resolution + exact_mass_values = db.select_mass_values("0_0001", integer_subset, table_name) + + # use combinations to get second group of masses instead of subset sum - subset sum is integer mass only + exact_subsets = [] + for mass_combo in itertools.product(*exact_mass_values): + if round(sum(mass_combo), 4) == round(exact_mass, 4): + exact_subsets.append(mass_combo) + + if len(exact_subsets) == 0: + continue + + # refines groups based on ecs and gets substructures from db (appends to substructure_subsets) + for exact_subset in exact_subsets: + substructure_subsets += build_from_subsets(exact_subset, mf=mf, table_name=table_name, db=db) + + return substructure_subsets 
+ + +def refine_masses_prescribed(substructure_subsets, mf, exact_mass, prescribed_substructures, ppm, integer_mass_values, + max_n_substructures, table_name, db, tolerance): + """ + Takes a set of masses and applies :py:meth:`metaboblend.algorithms.subset_sum`. Generates a list of subsets of + substructures to be combined with possible fragment substructures to generate candidate target metabolites. + + :param prescribed_substructures: Substructures that may represent the neutral fragment structure, as retrieved + by :py:meth:`metaboblend.build_structures.get_possible_fragment_ions`. + + :param substructure_subsets: List of substructure subsets to be filled, usually empty. + + :param integer_mass_values: List of possible integer masses for all valid substructures. + + :param max_n_substructures: The maximum number of substructures to be combined. + + :param table_name: The name of the table from which to extract substructures - if None, searches the main tables. + + :param db: Connection to the SQLite3 substructure database, :py:meth:`metaboblend.databases.SubstructureDb`. + + :param mf: List of integers detailing the molecular formula of the target metabolite, in the format + [C, H, N, O, P, S]. + + :param exact_mass: The exact mass (float) of the target metabolite. + + :param max_n_substructures: The maximum number of substructures to be used for building molecules. Note that this + does not include the fragment substructure itself. + + :param ppm: The maximal tolerated m/z deviation (in parts per million) of the mass of substructures from the + supplied `fragment_masses`. + + :param tolerance: Minimum absolute mz tolerance for the fragment and precursor masses. Only used if + `prescribed_substructures` is not None. + + :return: Returns a list of lists - one list for each substructure subset. 
Each substructure subset list contains + a list for each + """ + + # for each fragment peak in the MS2 spectrum + for fragment_mass__1 in prescribed_substructures.keys(): + + # work out the corresponding neutral loss + loss_mass__1 = int(round(round(exact_mass, 0) - fragment_mass__1, 0)) + + if ((exact_mass / 1000000) * ppm) > tolerance: + tolerance = round((exact_mass / 1000000) * ppm, 4) + + if len(integer_mass_values) == 0: + return {} + + # get subsets of masses at integer level that could build up to the loss mass (i.e. can be combined with fragment substructure(s)) + integer_subsets = list(subset_sum(integer_mass_values, loss_mass__1, max_n_substructures)) + + for integer_subset in integer_subsets: + + if len(integer_subset) > max_n_substructures or len(integer_subset) == 0: + continue + + # refine groups of (loss) masses to 4dp mass resolution + exact_mass_values = db.select_mass_values("0_0001", integer_subset, table_name) + + for fragment_mass_0_0001 in prescribed_substructures[fragment_mass__1].keys(): + + # use combinations to get second group of masses instead of subset sum - subset sum is integer mass only + exact_subsets = [] + for mass_combo in itertools.product(*exact_mass_values): + if abs((fragment_mass_0_0001 + sum(mass_combo)) - exact_mass) <= tolerance: + exact_subsets.append(mass_combo) + + if len(exact_subsets) == 0: + continue + + # refines groups based on ecs and gets substructures from db (appends to substructure_subsets) + mf_to_substructure_id = {} + for substructure_id in prescribed_substructures[fragment_mass__1][fragment_mass_0_0001].keys(): + try: + mf_to_substructure_id[tuple(prescribed_substructures[fragment_mass__1][fragment_mass_0_0001][substructure_id]["mf"])].append(substructure_id) + except KeyError: + mf_to_substructure_id[tuple(prescribed_substructures[fragment_mass__1][fragment_mass_0_0001][substructure_id]["mf"])] = [substructure_id] + + for fragment_mf in mf_to_substructure_id.keys(): + for exact_subset in exact_subsets: 
+ + loss_mf = [atom - fragment_atom for atom, fragment_atom in zip(mf, fragment_mf)] + + substructure_subsets += build_from_subsets( + exact_subset, + mf=loss_mf, + table_name=table_name, + db=db, + fragment_substructures=[prescribed_substructures[fragment_mass__1][fragment_mass_0_0001][substructure_id] for substructure_id in mf_to_substructure_id[fragment_mf]] + ) + + return substructure_subsets + + +def gen_subs_table(db, ha_min, ha_max, max_degree, max_atoms_available, max_mass, table_name="subset", + minimum_frequency=None): + """ + Generate a temporary secondary substructure table restricted by a set of parameters. Generated as an initial step + in :py:meth:`metaboblend.build_structures.generate_structures` and + :py:meth:`metaboblend.build_structures.annotate_msn` in order to limit the processing overhead as a result of + repeatedly querying the SQLite substructure database. + + :param max_mass: The maximum allowed mass of substructures in the temporary table; there is no point considering + substructures with greater mass than the target mol. + + :param db: Connection to a :py:meth:`metaboblend.databases.SubstructureDb` from which to extract substructures. + + :param ha_min: Minimum value of `heavy_atoms` for substructures to be transferred into the temporary table. + If None, no limit is applied. + + :param ha_max: Maximum value of `heavy_atoms` for substructures to be transferred into the temporary table. + If None, no limit is applied. + + :param max_degree: The maximum total valence (ie, the product of `atoms_available` and the degree of their bonds) + to be included in the temporary table. + + :param max_atoms_available: The maximal atoms available of substructures to be included in the temporary table. + + :param table_name: Defaults to "subset_substructures", which is cleaned up upon database closure. The name of the + table to be generated + + :param minimum_frequency: The minimum frequency of substructures in table_name; e.g. 
substructures have a frequency + of 1 if they are unique. + + :return: The name of the temporary secondary substructure table. + """ + + db.cursor.execute("DROP TABLE IF EXISTS %s" % (table_name + "_substructures")) + db.cursor.execute("DROP TABLE IF EXISTS %s" % (table_name + "_substructure_ions")) + + if minimum_frequency is None: + freq_statement = "" + else: + freq_statement = """ + AND smiles IN + (SELECT smiles + FROM hmdbid_substructures + GROUP BY smiles + HAVING COUNT(*) >= {}) + """.format(minimum_frequency,) + + if ha_min is None: + ha_min_statement = "" + else: + ha_min_statement = """ + AND heavy_atoms >= %s""" % str(ha_min) + + if ha_max is None: + ha_max_statement = "" + else: + ha_max_statement = """ + AND heavy_atoms <= %s""" % str(ha_max) + + if max_mass is None: + max_mass_statement = "" + else: + max_mass_statement = """ + AND exact_mass__1 < %s""" % str(max_mass) + + db.temporary_table_names.append(table_name + "_substructures") + + db.cursor.execute("""CREATE TABLE {} ( + substructure_id INTEGER PRIMARY KEY, + smiles TEXT NOT NULL UNIQUE, + heavy_atoms INTEGER, + length INTEGER, + exact_mass__1 INTEGER, + exact_mass__0_0001 REAL, + exact_mass REAL, + C INTEGER, + H INTEGER, + N INTEGER, + O INTEGER, + P INTEGER, + S INTEGER, + valence INTEGER, + valence_atoms TEXT, + atoms_available INTEGER, + bond_types TEXT, + dummies TEXT, + mol BLOB) + """.format( + table_name + "_substructures", + table_name + "_substructure_ions" + )) + + db.cursor.execute("""INSERT INTO {} + SELECT * + FROM substructures + WHERE atoms_available <= {} + AND valence <= {}{}{}{}{} + """.format( + table_name + "_substructures", + max_atoms_available, + max_degree, + max_mass_statement, + freq_statement, + ha_min_statement, + ha_max_statement + )) + + db.temporary_table_names.append(table_name + "_substructure_ions") + + db.cursor.execute("""CREATE TABLE {}_substructure_ions ( + substructure_id INTEGER, + hydrogen_modification INTEGER, + ion_mode_positive BOOLEAN, + 
modified_exact_mass__1 INTEGER, + modified_exact_mass__0_0001 REAL, + PRIMARY KEY (substructure_id, hydrogen_modification, ion_mode_positive), + FOREIGN KEY (substructure_id) REFERENCES {}_substructures(substructure_id)) + """.format(table_name, table_name)) + + db.cursor.execute("""INSERT INTO {} + SELECT * + FROM substructure_ions + WHERE substructure_id IN (SELECT substructure_id FROM {}) + """.format( + table_name + "_substructure_ions", + table_name + "_substructures" + )) + + db.conn.commit() + db.create_temp_indexes(table_name) + + return table_name + + +def build_from_subsets(exact_subset, mf, table_name, db, fragment_substructures=None): + """ + A stage of the :py:meth:`metaboblend.build_structures.build` workflow for generating molecules to a given mass + from substructures. At this stage, mass subsets have been identified in the substructure database. Each of these + groups are now filtered further by identifying masses that refer to valid subsets of molecules, before they are + built to generate new molecules. + + :param db: The substructure and connectivity database. Elemental compositions and substructures are retrieved from + the database; this information is listed as "substructure_subset" and will be appended to the + substructure_subsets list provided as a parameter. + + :param exact_subset: Group of masses that sum to the correct total mass, refer to substructures in the substructure + database. + + :param mf: List of integers detailing the molecular composition of the target metabolite, in the format + `[C, H, N, O, P, S]`. + + :param table_name: The name of the table within the substructure database from which to extract substructures. A + prefiltered table based on the parameters specified in :py:meth:`metaboblend.build_structures.build`. See + :py:meth:`metaboblend.build_structures.gen_subs_table`. + + :param fragment_substructures: If None, standard building from the input mass subset is carried out. 
Else, + represents the retrieved candidate fragment substructures to be combined with the substructures of the input + mass subset. + + :return: Returns a list in the same format as the input mass subset, `exact_subset`. Instead of masses (floats), + the substructures are now represented by dictionaries, as retrieved by + :py:meth:`metaboblend.build_structures.get_possible_fragment_ions`. + """ + + substructure_subsets = [] + mf_subset = combine_mfs(exact_subset, db, table_name, "0_0001") + + if len(mf_subset) == 0: + return [] + + for ec_product in itertools.product(*mf_subset): + + if mf != list(numpy.array(ec_product).sum(axis=0)): + continue # check each set of elemental compositions matches the target mol + + substructure_subset = db.select_substructures(ec_product, table_name) + + if len(substructure_subset) == 0: + continue + + if fragment_substructures is None: + substructure_subsets.append(substructure_subset) + else: + substructure_subsets.append([fragment_substructures] + substructure_subset) + + return substructure_subsets + + +def get_bond_enthalpies(): + """ Gets predicted bond dissociation energies for each bond type and elemental composition.
""" + + return {1.0: {'C': {'C': 348, 'N': 305, 'O': 360, 'P': 264, 'S': 272}, + 'N': {'C': 305, 'N': 163, 'O': 222, 'P': None, 'S': None}, + 'O': {'C': 360, 'N': 222, 'O': 146, 'P': 335, 'S': None}, + 'P': {'C': 264, 'N': None, 'O': 335, 'P': 201, 'S': None}, + 'S': {'C': 272, 'N': None, 'O': None, 'P': None, 'S': 226}}, + 1.5: {'C': {'C': 837, 'N': 890, 'O': None, 'P': None, 'S': None}, + 'N': {'C': 890, 'N': 944, 'O': None, 'P': None, 'S': None}, + 'O': {'C': None, 'N': None, 'O': None, 'P': None, 'S': None}, + 'P': {'C': None, 'N': None, 'O': None, 'P': None, 'S': None}, + 'S': {'C': None, 'N': None, 'O': None, 'P': None, 'S': None}}, + 2.0: {'C': {'C': 612, 'N': 613, 'O': 743, 'P': None, 'S': 573}, + 'N': {'C': 613, 'N': 409, 'O': 607, 'P': None, 'S': None}, + 'O': {'C': 743, 'N': 607, 'O': 496, 'P': 544, 'S': 522}, + 'P': {'C': None, 'N': None, 'O': 544, 'P': None, 'S': 335}, + 'S': {'C': 573, 'N': None, 'O': 522, 'P': 335, 'S': 425}}} + + +def substructure_combination_build(substructure_subset, configs_iso, prescribed_method, isomeric_smiles, + bond_enthalpies, max_bde): + """ + Final stage for building molecules; takes a combination of substructures (substructure_combination) and builds them + according to graphs in the substructure database. May be run in parallel. + + :param substructure_subset: Combinations of substructures for building mols. + + :param configs_iso: Possible substructure combinations extracted from the connectivity database. A tuple containing + tuples for each substructure; these tuples specify how many bonds each substructure can make. + + :param prescribed_method: If True, assumes the first substructure in `substructure_subset` to be the fragment + substructure. + + :param isomeric_smiles: True/False, should output smiles be written with isomeric information? + + :param bond_enthalpies: Dictionary of bond enthalpies, as generated by + :py:meth:`metaboblend.build_structures.get_bond_enthalpies`. 
+ + :param retain_substructures: Whether to record the substructures used to generate final structures. + + :return: List of smiles representing molecules generated (and the substructures used to generate them). + """ + + smis = {} + + for substructure_combination in itertools.product(*substructure_subset): + + total_valence = substructure_combination[0]["valence"] + + if prescribed_method: + substructure_combination[0]["fragment"] = True + even_fragment = substructure_combination[0]["even"] + + if "ppm_error" in substructure_combination[0].keys(): + ppm_error = substructure_combination[0]["ppm_error"] + else: + ppm_error = None + + substructure_combination = sorted(substructure_combination, key=itemgetter('atoms_available', 'valence')) + + v_a = () + j = -1 + + if prescribed_method: + fragment_indexes = [] + + for i, d in enumerate(substructure_combination): + + v_a += (tuple(d["degree_atoms"].values()),) # obtain valence configuration of the set of substructures + + for atom_available in tuple(d["degree_atoms"].values()): + j += 1 + + try: + if prescribed_method: + if d["fragment"]: + fragment_indexes.append(j) + + except KeyError: + continue + + if str(v_a) not in configs_iso: # check mols "fit" together according to the connectivity database + continue + + mol_comb, atoms_available, atoms_to_remove, bond_types, bond_mismatch = reindex_atoms(substructure_combination) + + if bond_mismatch: + continue # check that bond types are compatible (imperfect check) + + for edges in configs_iso[str(v_a)]: # build mols for each graph in connectivity db + if prescribed_method: + non_fragment_edges = False + + for edge in edges: # check that edges only connect to fragment ion + if edge[0] not in fragment_indexes and edge[1] not in fragment_indexes: + non_fragment_edges = True + + if non_fragment_edges: + continue + + # add bonds between substructures + mol_e, total_bde = add_bonds(mol_comb, edges, atoms_available, bond_types, bond_enthalpies) + + if mol_e is None or total_bde 
is None: + continue + elif max_bde is not None: + if total_bde > max_bde: + continue + + atoms_to_remove.sort(reverse=True) + [mol_e.RemoveAtom(a) for a in atoms_to_remove] # clean up dummy atoms + + mol_out = mol_e.GetMol() # generate the final (non-editable) mol + + try: + Chem.SanitizeMol(mol_out) # clean the mol - ensure it is valid & canonical + except: + continue + + try: # append the canonical smiles of the final structure + final_structure = Chem.MolToSmiles(mol_out, isomericSmiles=isomeric_smiles) + + except RuntimeError: + continue # bad bond type violation + + final_substructures = [subs["smiles"] for subs in substructure_combination] + + # add required information to a dictionary + try: + smis[final_structure]["bde"].append(total_bde) + smis[final_structure]["valence"].append(total_valence) + + smis[final_structure]["substructures"].append(final_substructures) + + if prescribed_method: + smis[final_structure]["even"].append(even_fragment) + + if ppm_error is not None: + smis[final_structure]["ppm_error"].append(ppm_error) + + except KeyError: + smis[final_structure] = {"bde": [total_bde], "valence": [total_valence]} + + if prescribed_method: + smis[final_structure]["even"] = [even_fragment] + + if ppm_error is not None: + smis[final_structure]["ppm_error"] = [ppm_error] + + smis[final_structure]["substructures"] = [final_substructures] + + return smis diff --git a/metaboblend/data/connectivity.sqlite b/metaboblend/data/connectivity.sqlite index dc31ed0..56981d9 100644 Binary files a/metaboblend/data/connectivity.sqlite and b/metaboblend/data/connectivity.sqlite differ diff --git a/metaboblend/databases/__init__.py b/metaboblend/databases/__init__.py new file mode 100644 index 0000000..8a2d78a --- /dev/null +++ b/metaboblend/databases/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. 
+# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# diff --git a/metaboblend/auxiliary.py b/metaboblend/databases/connectivity.py similarity index 55% rename from metaboblend/auxiliary.py rename to metaboblend/databases/connectivity.py index 4408224..6432afb 100644 --- a/metaboblend/auxiliary.py +++ b/metaboblend/databases/connectivity.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2019-2020 Ralf Weber +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. # @@ -19,9 +19,150 @@ # along with MetaboBlend. If not, see . # +import os +import io +import pickle +import sqlite3 +import tempfile import itertools +import subprocess import pylab as plt import networkx as nx +from typing import Union + + +def create_connectivity_database( + path_connectivity_db: Union[str, bytes, os.PathLike], + max_n_substructures: int = 3, + max_atoms_available: int = 2, + path_ri: Union[str, bytes, os.PathLike, None] = None +) -> None: + """ + Generates a connectivity database containing sets of possible combinations of substructures; these combinations are + represented by graphs whose vertices correspond to substructures and edges to bonds. We use geng, part of the nauty + package, along with RI3.6 to ensure that the generated graphs are non-isomorphic - i.e. we only generate each + combination of substructures once. 
These graphs are pickled in order to be stored in the final column of the + SQLite 3 connectivity database. + + :param path_connectivity_db: The path at which to generate the SQLite 3 database. + + :param max_n_substructures: The maximal number of substructures (vertices). At least two substructures must be + available for bonding for a graph to be created. + + :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for + building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming + chemical bonds (e.g. single or double bonds). + + :param path_ri: The path of RI, a required tool for verifying subgraph isomorphism. + """ + + conn = sqlite3.connect(path_connectivity_db) + cursor = conn.cursor() + + cursor.execute("""DROP TABLE IF EXISTS subgraphs""") + cursor.execute("""CREATE TABLE subgraphs ( + id_pkl INTEGER, + n_graphs INTEGER, + graph6 TEXT, + k INTEGER, + k_partite TEXT, + k_valences TEXT, + nodes_valences TEXT, + n_nodes INTEGER, + n_edges INTEGER, + root BLOB, + PRIMARY KEY (graph6, k_partite, nodes_valences) + );""") + conn.commit() + + id_pkl = 0 + + for g, p in calculate_complete_multipartite_graphs(max_atoms_available, max_n_substructures): + + # get complete set of non-isomorphic graphs, using geng, from a distinct multipartite graph as input + proc = subprocess.Popen(["geng", str(len(g.nodes)), "-d1", "-D2", "-q"], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) # max valence for single atom of 2 + geng_out, err = proc.communicate() + + proc.stdout.close() + proc.stderr.close() + + # pipe geng output to RI to generate mappings (complete set of non-isomorphic configurations) + for i, line_geng in enumerate(geng_out.split()): + + s_g = nx.read_graph6(io.BytesIO(line_geng)) + + k_gfu = tempfile.NamedTemporaryFile(mode="w", delete=False) + k_gfu.write(graph_to_ri(g, "k_graph")) + k_gfu.seek(0) + + s_gfu = tempfile.NamedTemporaryFile(mode="w", delete=False) + 
s_gfu.write(graph_to_ri(s_g, "subgraph")) + s_gfu.seek(0) + + proc = subprocess.Popen([path_ri, "mono", "geu", k_gfu.name, s_gfu.name], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) # TODO: add ri as dependency + ri_out, err = proc.communicate() + + k_gfu.close() + s_gfu.close() + + mappings = [] + subgraphs = {} + + for line in ri_out.decode("utf-8").splitlines(): + if line[0] == "{": + mappings.append(eval(line)) + + if len(mappings) > 0: + gi = graph_info(p, s_g, mappings, ) # convert mappings to valence/connectivity specifications + + for vn in gi: + if vn not in subgraphs: + subgraphs[vn] = gi[vn] + + else: + for es in gi[vn]: + if es not in subgraphs[vn]: + subgraphs[vn].append(es) + + if len(subgraphs) > 0: + for vn in subgraphs: # for each valence configuration + subgraphs[vn] = sort_subgraphs(subgraphs[vn]) # sort to remove duplicate configurations + root = {} # graph to be pickled + + for fr in subgraphs[vn]: + parent = root + for e in fr: + parent = parent.setdefault(e, {}) + + vt = tuple([sum(v) for v in eval(vn)]) + + id_pkl += 1 + cursor.execute("""INSERT INTO subgraphs ( + id_pkl, + n_graphs, + graph6, + k, + k_partite, + k_valences, + nodes_valences, + n_nodes, n_edges, + root) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", ( + id_pkl, + len(subgraphs[vn]), + line_geng, + len(p), + str(p), + str(vt), + str(vn), + s_g.number_of_nodes(), + s_g.number_of_edges(), + pickle.dumps(root))) + + conn.commit() + conn.close() def calculate_complete_multipartite_graphs(max_atoms_available, max_n_substructures): diff --git a/metaboblend/databases/results.py b/metaboblend/databases/results.py new file mode 100644 index 0000000..33925ed --- /dev/null +++ b/metaboblend/databases/results.py @@ -0,0 +1,527 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. 
+# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# + +import os +import csv +import sqlite3 +from math import sqrt + + +class ResultsDb: + """ + Methods for interacting with the SQLITE3 results database, as created by + :py:meth:`metaboblend.build_structures.annotate_msn`. + + :param path_results: Directory to which results will be written. + + :param msn: If True, creates a table, "spectra", which holds the MSn spectra. + + :param retain_substructures: If True, keeps the substructure table permanently. Else, the table will be dropped + after calculations are performed. + + :param weights: Dictionary of weights that will be passed to :py:meth:`metaboblend.results.define_scoring_function` + in order to generate an SQL function for calculating scoring weights. If None, default weights will be used. 
+ """ + + def __init__(self, path_results, msn=True, retain_substructures=False, weights=None): + """Constructor method.""" + + self.path_results = path_results + self.path_results_db = os.path.join(self.path_results, "metaboblend_results.sqlite") + + self.retain_substructures = retain_substructures + self.msn = msn + + if weights is None: + + self.weights = {"base_peak_weight": 0.3, "bde_weight": 0.4, "even_weight": 0.2, "valence_weight": 0.1} + else: + self.weights = weights + + self.calc_substructure_combo_score = define_scoring_function(self.weights) + + self.conn = None + self.cursor = None + self.open() + + self.substructure_combo_id = 0 + + def open(self): + """ Opens connection to the SQLite3 database and creates custom functions. """ + + self.conn = sqlite3.connect(self.path_results_db) + self.cursor = self.conn.cursor() + + self.conn.create_function("CALC_SUBSTRUCTURE_COMBO_SCORE", 4, self.calc_substructure_combo_score) + + def create_results_db(self): + """ Generates a new results database. 
""" + + self.conn.close() + + if os.path.exists(self.path_results_db): + os.remove(self.path_results_db) + + self.open() + + self.cursor.execute("""CREATE TABLE queries ( + ms_id_num INTEGER PRIMARY KEY, + ms_id TEXT, + exact_mass NUMERIC, + C INTEGER, + H INTEGER, + N INTEGER, + O INTEGER, + P INTEGER, + S INTEGER, + ppm INTEGER, + ha_min INTEGER, + ha_max INTEGER, + max_atoms_available INTEGER, + max_degree INTEGER, + max_n_substructures INTEGER, + hydrogenation_allowance INTEGER, + isomeric_smiles INTEGER)""") + + if self.msn: + self.cursor.execute("""CREATE TABLE spectra ( + ms_id_num INTEGER, + fragment_id INTEGER, + neutral_mass NUMERIC, + max_bde NUMERIC, + PRIMARY KEY (ms_id_num, fragment_id))""") + + self.cursor.execute("""CREATE TABLE structure_smiles ( + structure_id INTEGER PRIMARY KEY, + smiles TEXT UNIQUE NOT NULL + )""") + + self.create_structures_table() + + self.cursor.execute("""CREATE TABLE results ( + ms_id_num INTEGER, + fragment_id INTEGER, + structure_id TEXT, + result_score NUMERIC, + PRIMARY KEY(ms_id_num, fragment_id, structure_id) + FOREIGN KEY (ms_id_num, structure_id) + REFERENCES structures(ms_id_num, structure_id) + FOREIGN KEY (ms_id_num, fragment_id) + REFERENCES spectra(ms_id_num, fragment_id))""") + + self.cursor.execute("""CREATE TABLE substructure_combos ( + substructure_combo_id INTEGER, + ms_id_num INTEGER, + fragment_id INTEGER, + structure_id TEXT, + bde INTEGER, + valence INTEGER, + even BOOLEAN, + substructure_combo_score NUMERIC, + PRIMARY KEY (substructure_combo_id), + FOREIGN KEY (ms_id_num, fragment_id, structure_id) + REFERENCES results(ms_id_num, fragment_id, structure_id))""") + + if self.retain_substructures: + self.cursor.execute("""CREATE TABLE substructures ( + substructure_combo_id INTEGER, + substructure_position_id INTEGER, + substructure_smiles TEXT, + PRIMARY KEY (substructure_combo_id, substructure_position_id) + FOREIGN KEY (substructure_combo_id) REFERENCES substructure_combos(substructure_combo_id))""") 
+ + self.conn.commit() + + def create_structures_table(self): + """ Create structures table. """ + + self.cursor.execute("""CREATE TABLE structures ( + ms_id_num INTEGER, + structure_id INTEGER, + frequency INTEGER, + frequency_score NUMERIC, + PRIMARY KEY (ms_id_num, structure_id) + FOREIGN KEY (structure_id) REFERENCES structure_smiles(structure_id))""") + + def add_ms(self, msn_data, ms_id, ms_id_num, parameters): + """ + Add entries to the `queries` and `spectra` tables. + + :param msn_data: Dictionary in the form + `msn_data[id] = {mf: [C, H, N, O, P, S], exact_mass: float, fragment_masses: []}`. id represents a unique + identifier for a given spectral tree or fragmentation spectrum, mf is a list of integers referring to the + molecular formula of the structure of interest, exact_mass is the mass of this molecular formula to >=4d.p. + and fragment_masses are neutral fragment masses generated by this structure used to inform candidate + scoring. See :py:meth:`metaboblend.build_structures.annotate_msn`. + + :param ms_id: Unique identifier for the annotation of a single metabolite. + + :param ms_id_num: Unique numeric identifier for the annotation of a single metabolite. + + :param parameters: List of parameters, in the form: [ppm, ha_min, ha_max, max_atoms_available, max_degree, + max_n_substructures, hydrogenation_allowance, isomeric_smiles]. See + :py:meth:`metaboblend.build_structures.annotate_msn`. 
+ """ + + for i, parameter in enumerate(parameters): + if parameter is None: + parameters[i] = "NULL" + elif isinstance(parameter, bool): + parameters[i] = int(parameter) + + self.cursor.execute("""INSERT INTO queries ( + ms_id, + ms_id_num, + exact_mass, + C, H, N, O, P, S, + ppm, + ha_min, + ha_max, + max_atoms_available, + max_degree, + max_n_substructures, + hydrogenation_allowance, + isomeric_smiles + ) VALUES ('{}', {}, {}, '{}', '{}', '{}', '{}', '{}', '{}', {})""".format( + ms_id, + ms_id_num, + msn_data[ms_id]["exact_mass"], + msn_data[ms_id]["mf"][0], msn_data[ms_id]["mf"][1], + msn_data[ms_id]["mf"][2], msn_data[ms_id]["mf"][3], + msn_data[ms_id]["mf"][4], msn_data[ms_id]["mf"][5], + ", ".join([str(p) for p in parameters]) + )) + + self.conn.commit() + + def add_results(self, ms_id_num, smi_dict, fragment_mass=None, fragment_id=None): + """ + Record which smiles were generated for a given fragment mass. + + :param ms_id_num: Unique identifier for the annotation of a single metabolite. + + :param smi_dict: The fragment and substructure smiles generated by the annotation of a single peak for a single + metabolite. + + :param fragment_mass: The neutral fragment mass that has been annotated. + + :param fragment_id: The unique identifier for the fragment mass that has been annotated. 
+ """ + + self.drop_indexes() + + # if annotating msn spectra, fill the spectra table + if self.msn: + + # get the maximum BDE across all structures generated for this particular fragment ion + max_bde = 0 + + for structure_smiles in smi_dict.keys(): + max_bde = max(max_bde, max(smi_dict[structure_smiles]["bde"])) + + self.cursor.execute("""INSERT OR IGNORE INTO spectra ( + ms_id_num, + fragment_id, + neutral_mass, + max_bde + ) VALUES ('{}', {}, {}, {}) + """.format( + ms_id_num, + fragment_id, + fragment_mass, + max_bde + )) + else: + fragment_id = "NULL" + + # unique smiles candidates for this fragment + for structure_smiles in smi_dict.keys(): + + # insert structure smiles + self.cursor.execute("INSERT OR IGNORE INTO structure_smiles (smiles) VALUES ('{}')".format(structure_smiles)) + + # get structure smiles row id + self.cursor.execute("SELECT structure_id FROM structure_smiles WHERE smiles = '{}'".format(structure_smiles)) + structure_id = self.cursor.fetchone()[0] + + # for each combination of substructures that generated the candidate + for i in range(len(smi_dict[structure_smiles]["substructures"])): + + if self.msn: + + if smi_dict[structure_smiles]["even"][i]: + even_structure = 1 + else: + even_structure = 0 + + else: + even_structure = "NULL" + + self.cursor.execute("""INSERT INTO substructure_combos ( + substructure_combo_id, + ms_id_num, + fragment_id, + structure_id, + bde, + valence, + even + ) VALUES ({}, {}, {}, '{}', {}, {}, {}) + """.format( + self.substructure_combo_id, + ms_id_num, + fragment_id, + structure_id, + smi_dict[structure_smiles]["bde"][i], + smi_dict[structure_smiles]["valence"][i], + even_structure + )) + + if self.retain_substructures: + for j, substructure in enumerate(smi_dict[structure_smiles]["substructures"][i]): + + self.cursor.execute("""INSERT INTO substructures ( + substructure_combo_id, + substructure_position_id, + substructure_smiles + ) VALUES ({}, {}, '{}') + """.format( + self.substructure_combo_id, + j, + 
+ substructure + )) + + self.substructure_combo_id += 1 + + self.cursor.execute("""INSERT INTO results ( + ms_id_num, + fragment_id, + structure_id + ) VALUES ({}, {}, '{}') + """.format( + ms_id_num, + fragment_id, + structure_id + )) + + self.conn.commit() + + def drop_indexes(self): + """ Drop indexes to improve insert performance. """ + + self.cursor.execute("""DROP INDEX IF EXISTS substructure_combos_results_reference""") + self.cursor.execute("""DROP INDEX IF EXISTS substructure_combos_ms_id_num""") + self.cursor.execute("""DROP INDEX IF EXISTS results_ms_id_num""") + self.cursor.execute("""DROP INDEX IF EXISTS results_ms_id_num_structure_smiles""") + + def create_indexes(self): + """ Create indexes for results DB query optimisation. """ + + self.drop_indexes() + + # for correlated querying of substructure combos table based on results + self.cursor.execute("""CREATE INDEX substructure_combos_results_reference + ON substructure_combos(ms_id_num, fragment_id, structure_id)""") + + self.cursor.execute("""CREATE INDEX substructure_combos_ms_id_num + ON substructure_combos(ms_id_num)""") + + self.cursor.execute("""CREATE INDEX results_ms_id_num + ON results(ms_id_num)""") + + self.cursor.execute("""CREATE INDEX results_ms_id_num_structure_smiles + ON results(ms_id_num, structure_id)""") + + def calculate_scores(self, ms_id_num): + """ + Scores cannot be calculated while generating the various tables. Must be completed after the entire structure + generation process of a metabolite. For instance, the maximum BDE across all sets of substructures generated + for a metabolite can only be ascertained once all these sets of substructures have been recorded. + + Does the calculations for aggregating structure candidate scores within SQL by updating columns that have been + ignored thus far. More complex calculations are written in python as SQL functions, as defined in + :py:meth:`metaboblend.results.open`. 
+ + :param ms_id_num: Unique identifier for the annotation of a single metabolite. + """ + + self.create_indexes() + + if not self.msn: + self.cursor.execute("""INSERT INTO structures (ms_id_num, structure_id, frequency) + SELECT ms_id_num, structure_id, COUNT(*) + FROM results + WHERE ms_id_num = {} + GROUP BY structure_id""".format(ms_id_num)) + + return + + self.cursor.execute("SELECT COUNT(*), MAX(max_bde) FROM spectra WHERE ms_id_num = %s" % ms_id_num) + num_fragments, max_bde = list(self.cursor.fetchall())[0] + + # calculate the BDE score for each combination of substructures, args = bde, max_bde, even_score, valence, ppm_error + self.cursor.execute("""UPDATE substructure_combos + SET substructure_combo_score = CALC_SUBSTRUCTURE_COMBO_SCORE(bde, {}, even, valence) + WHERE ms_id_num = {} + """.format(max_bde, ms_id_num)) + + # aggregate substructure combination scores for each peak/candidate structure + # updates results by aggregating the scores (selects the max score for the peak/candidate structure) + # then correlating the result of this query with the results table + # also gets the number of different combinations for the result which can be used for scoring + self.cursor.execute("""WITH substructure_combo_scores AS ( + SELECT MAX(substructure_combo_score) AS max_substructure_combo_score, fragment_id, structure_id + FROM substructure_combos + WHERE ms_id_num = {} + GROUP BY fragment_id, structure_id + ) + + UPDATE results + SET result_score = (SELECT max_substructure_combo_score + FROM substructure_combo_scores + WHERE fragment_id = results.fragment_id + AND structure_id = results.structure_id) + WHERE ms_id_num = {} + """.format(ms_id_num, ms_id_num)) + + # aggregate results scores across the spectrum for each unique structure candidate + self.cursor.execute("""INSERT INTO structures (ms_id_num, structure_id, frequency, frequency_score) + SELECT ms_id_num, structure_id, COUNT(*), (SUM(result_score) * 1.0) / {} + FROM results + WHERE ms_id_num = {} + 
GROUP BY structure_id""".format(num_fragments, ms_id_num)) + + self.conn.commit() + + def recalculate_scores(self, weights=None): + """ Re-calculates scores for the results DB. """ + + if weights is not None: + self.conn.close() + self.weights = weights + self.calc_substructure_combo_score = define_scoring_function(weights) + self.open() + + self.cursor.execute("DROP TABLE IF EXISTS structures") + self.create_structures_table() + + self.cursor.execute("SELECT DISTINCT ms_id_num FROM queries") + ms_id_nums = [row[0] for row in self.cursor.fetchall()] + + for i in ms_id_nums: + self.calculate_scores(i) + + def get_structures(self, ms_id_num): + """ + Gets smiles of generated structures. In the case of the MSn annotation workflow, also gets structure + frequencies. + + :param ms_id_num: Unique identifier for the annotation of a single metabolite. + + :return: In the case of simple structure generation, returns a set of smiles strings for output structures. + For the MSn annotation workflow, returns a dictionary with smiles as keys and the number of peaks for which + the smiles were generated as values. + """ + + if self.msn: + msn_str = ", frequency" + else: + msn_str = "" + + self.cursor.execute("""SELECT smiles{} FROM structures + LEFT JOIN structure_smiles + ON structures.structure_id = structure_smiles.structure_id + WHERE ms_id_num = {} + """.format(msn_str, ms_id_num)) + + if self.msn: + return [t for t in self.cursor.fetchall()] + else: + return [item for t in self.cursor.fetchall() for item in t] + + def generate_csv_output(self): + """ Generate CSV file output for i) queries and tool parameters and ii) structures generated. 
""" + + with open(os.path.join(self.path_results, "metaboblend_queries.csv"), "w", newline="") as results_file, \ + open(os.path.join(self.path_results, "metaboblend_structures.csv"), "w", newline="") as ms_file: + + results_writer = csv.writer(results_file, delimiter=",") + ms_writer = csv.writer(ms_file, delimiter=",") + + results_writer.writerow(["ms_id_num", "ms_id", "exact_mass", "C", "H", "N", "O", "P", "S", "ppm", "ha_min", "ha_max", + "max_atoms_available", "max_degree", "max_n_substructures", + "hydrogenation_allowance", "isomeric_smiles"]) + + self.cursor.execute("SELECT * FROM queries") + + for query in self.cursor.fetchall(): + results_writer.writerow(query) + + ms_writer.writerow(["ms_id", "smiles", "frequency", "structure_score"]) + + self.cursor.execute("SELECT * FROM structures") + + for structure in self.cursor.fetchall(): + ms_writer.writerow(structure) + + def close(self): + """ Close the connection to the SQLITE3 database. """ + + self.conn.close() + + +def define_scoring_function(weights): + + base_peak_weight = weights["base_peak_weight"] + bde_weight = weights["bde_weight"] + even_weight = weights["even_weight"] + valence_weight = weights["valence_weight"] + + def calc_substructure_combo_score(bde, max_bde, even_score, valence): + """ + SQL function for the calculation of scores at the results table level. + + Scores: + - base peak score + - bde_score: previously calculated BDE score + - even_score: logical, 0 if doesn't follow hydrogenation rules (MS-FINDER) + - valence: an integer with a value of greater than 0 + - ppm_error: a real number (should be 0 to 5) + + Each score should be normalised between 0 and 1 + The sum of all weights should sum to 1 + Therefore, the final returned score should be between 0 and 1 + + There are other scores that take place at results level (`calc_results_score`, below). 
+ """ + + # MS-FINDER method of calculating bde scores + bde_score = sqrt(1 - (bde / max_bde)) + + # the base value of a peak match for the structure + base_peak_score = 1 + + # the valence of the fragment substructure + valence_score = sqrt(1 / valence) + + # calculate the score at substructure combination level, weights should add up to 1 when summed with the scoring at results level + return base_peak_weight * base_peak_score + bde_weight * bde_score + even_weight * even_score + valence_weight * valence_score + + return calc_substructure_combo_score diff --git a/metaboblend/databases.py b/metaboblend/databases/substructures.py similarity index 79% rename from metaboblend/databases.py rename to metaboblend/databases/substructures.py index 2c31d12..5699779 100644 --- a/metaboblend/databases.py +++ b/metaboblend/databases/substructures.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2019-2020 Ralf Weber +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. # @@ -19,12 +19,10 @@ # along with MetaboBlend. If not, see . # -import io import os import pickle import sqlite3 -import tempfile -import subprocess +import itertools import networkx as nx from typing import Sequence, Dict, Union @@ -32,8 +30,7 @@ from rdkit.Chem import Recap from rdkit.Chem import BRICS -from .parse import parse_xml -from .auxiliary import calculate_complete_multipartite_graphs, graph_to_ri, graph_info, sort_subgraphs +from metaboblend.parse import parse_xml class SubstructureDb: @@ -64,7 +61,9 @@ def __init__(self, path_substructure_db, path_connectivity_db=None): if self.path_connectivity_db is not None: self.cursor.execute("ATTACH DATABASE '%s' as 'graphs';" % self.path_connectivity_db) - def select_compounds(self, cpds=[]): + self.temporary_table_names = [] + + def select_compounds(self, cpds=None): """ Select all keys from the compounds table of the substructure database, filtered by HMDB IDs if provided. 
@@ -74,6 +73,9 @@ def select_compounds(self, cpds=[]): essentially provides a list containing a list for the values of each row. """ + if cpds is None: + cpds = [] + if len(cpds) > 0: sql = " WHERE hmdbid in ('%s')" % ("', '".join(map(str, cpds))) else: @@ -247,28 +249,35 @@ def select_mass_values(self, accuracy, masses, table_name): :return: Sorted list of mass values from the substructure database, filtered by the supplied parameters. """ - if type(masses) == list: - if len(masses) > 0: - mass_values = [] + if table_name is None: + table_name = "substructures" + else: + table_name += "_substructures" + + if type(masses) == list and len(masses) > 0: - for m in masses: - self.cursor.execute("""SELECT DISTINCT exact_mass__{} + mass_values = [] + + for m in masses: + self.cursor.execute("""SELECT DISTINCT exact_mass__{} FROM {} WHERE exact_mass__1 = {} - """.format(accuracy, table_name, m)) + """.format(accuracy, table_name, m)) + + m_values = [record[0] for record in self.cursor.fetchall()] + m_values.sort() - m_values = [record[0] for record in self.cursor.fetchall()] - m_values.sort() + mass_values.append(m_values) - mass_values.append(m_values) + return mass_values - return mass_values + else: - self.cursor.execute("SELECT DISTINCT exact_mass__{} FROM {}".format(accuracy, table_name)) - mass_values = [record[0] for record in self.cursor.fetchall()] - mass_values.sort() + self.cursor.execute("SELECT DISTINCT exact_mass__{} FROM {}".format(accuracy, table_name)) + mass_values = [record[0] for record in self.cursor.fetchall()] + mass_values.sort() - return mass_values + return mass_values def select_mfs(self, exact_mass, table_name, accuracy): """ @@ -288,10 +297,19 @@ def select_mfs(self, exact_mass, table_name, accuracy): of each row (C, H, N, O, P, S). 
""" + if table_name is None: + table_name = "substructures" + else: + table_name += "_substructures" + self.cursor.execute("""SELECT DISTINCT C, H, N, O, P, S - FROM {} + FROM {} WHERE exact_mass__{} = {} - """.format(table_name, accuracy, str(exact_mass))) + """.format( + table_name, + accuracy, + str(exact_mass) + )) return self.cursor.fetchall() @@ -314,7 +332,7 @@ def k_configs(self): configs[str(record[1])] = [] for path in self.paths(pickle.loads(record[0])): - configs[str(record[1])].append(path) + configs[str(record[1])].append(path) return configs @@ -353,6 +371,11 @@ def select_substructures(self, l_atoms, table_name): :py:meth:`metaboblend.databases.get_substructure`. """ + if table_name is None: + table_name = "substructures" + else: + table_name += "_substructures" + subsets = [] for i in range(len(l_atoms)): @@ -371,7 +394,8 @@ def select_substructures(self, l_atoms, table_name): AND O = {} AND P = {} AND S = {} - """.format(table_name, l_atoms[i][0], l_atoms[i][1], l_atoms[i][2], + """.format(table_name, + l_atoms[i][0], l_atoms[i][1], l_atoms[i][2], l_atoms[i][3], l_atoms[i][4], l_atoms[i][5])) records = self.cursor.fetchall() if len(records) == 0: @@ -393,15 +417,75 @@ def select_substructures(self, l_atoms, table_name): return subsets + def calculate_possible_hydrogenations(self): + """ + Calculate likely hydrogen re-arrangements, as per + :py:meth:`metaboblend.databases.calculate_hydrogen_rearrangements`. Inserts these into the substructure ions + table. 
+ """ + + self.cursor.execute("SELECT ROWID, smiles, mol, bond_types, exact_mass FROM substructures") + + for substructure in self.cursor.fetchall(): + + mol = Chem.Mol(substructure[2]) + bond_types = eval(substructure[3]) + + fragment_ions = [] + for i, fragment_ion in enumerate(bond_types.keys()): + + atom_symbol = mol.GetAtomWithIdx(fragment_ion).GetSymbol() + + for bond_type in bond_types[fragment_ion]: + fragment_ions.append((atom_symbol, bond_type == 2)) + + positive_hydrogenations = set() + negative_hydrogenations = set() + + for fragment_ion_permutation in itertools.permutations(fragment_ions): + + positive_hydrogenations.update(calculate_hydrogen_rearrangements(fragment_ion_permutation, "+")) + negative_hydrogenations.update(calculate_hydrogen_rearrangements(fragment_ion_permutation, "-")) + + self.insert_substructure_ion(substructure, positive_hydrogenations, 1) + self.insert_substructure_ion(substructure, negative_hydrogenations, 0) + + self.conn.commit() + + def insert_substructure_ion(self, substructure, possible_hydrogenations, ion_mode): + """ + Insert substructure ions into the substructure_ions table. + + :param substructure: List, [ROWID (int), smiles (str), mol (RDKit Mol), bond_types (Dict), exact_mass (float)] + + :param possible_hydrogenations: Number of hydrogens to add compared to the mass of the neutral substructure. + + :param ion_mode: If True, assumes positive ion mode, else is configured for negative ion mode. 
+ """ + + for possible_hydrogenation in possible_hydrogenations: + + self.cursor.execute("""INSERT INTO substructure_ions ( + substructure_id, + hydrogen_modification, + ion_mode_positive, + modified_exact_mass__1, + modified_exact_mass__0_0001) + values ({}, {}, {}, {}, {})""".format( + substructure[0], + possible_hydrogenation, + ion_mode, + round(substructure[4] + (possible_hydrogenation * 1.007825), 0), + round(substructure[4] + (possible_hydrogenation * 1.007825), 4) + )) + def create_compound_database(self): """Generates a substructure database, removing previously existing tables if they are present.""" self.cursor.execute("DROP TABLE IF EXISTS compounds") self.cursor.execute("DROP TABLE IF EXISTS substructures") + self.cursor.execute("DROP TABLE IF EXISTS substructure_ions") self.cursor.execute("DROP TABLE IF EXISTS hmdbid_substructures") - self.cursor.execute("DROP TABLE IF EXISTS unique_hmdbid") - self.cursor.execute("DROP TABLE IF EXISTS filtered_hmdbid_substructures") - self.cursor.execute("DROP TABLE IF EXISTS subset_substructures") self.cursor.execute("""CREATE TABLE compounds ( hmdbid TEXT PRIMARY KEY, @@ -436,214 +520,590 @@ def create_compound_database(self): dummies TEXT, mol BLOB)""") + self.cursor.execute("""CREATE TABLE substructure_ions ( + substructure_id INTEGER, + hydrogen_modification INTEGER, + ion_mode_positive BOOLEAN, + modified_exact_mass__1 INTEGER, + modified_exact_mass__0_0001 REAL, + PRIMARY KEY (substructure_id, hydrogen_modification, ion_mode_positive), + FOREIGN KEY (substructure_id) REFERENCES substructures(substructure_id))""") + self.cursor.execute("""CREATE TABLE hmdbid_substructures ( hmdbid TEXT, substructure_id INTEGER, PRIMARY KEY (hmdbid, substructure_id), - FOREIGN KEY (substructure_id) REFERENCES substructures(substructure_id))""") + FOREIGN KEY (substructure_id) REFERENCES substructures(substructure_id), + FOREIGN KEY (hmdbid) REFERENCES compounds(hmdbid))""") - def create_indexes(self, table="substructures", 
selection="all"): + def create_indexes(self): """Creates indexes for the `substructures` table for use by the build method.""" - self.cursor.execute("DROP INDEX IF EXISTS mass__1") - self.cursor.execute("DROP INDEX IF EXISTS mass__0_0001") + self.cursor.execute("DROP INDEX IF EXISTS modified_exact_mass__1") + self.cursor.execute("DROP INDEX IF EXISTS modified_exact_mass__0_0001") + self.cursor.execute("DROP INDEX IF EXISTS exact_mass__1") + self.cursor.execute("DROP INDEX IF EXISTS exact_mass__0_0001") self.cursor.execute("DROP INDEX IF EXISTS atoms") - if selection != "gen_subs_table": - self.cursor.execute("DROP INDEX IF EXISTS heavy_atoms__valence__atoms_available__exact_mass__1") - self.cursor.execute("DROP INDEX IF EXISTS smiles__heavy_atoms__valence__atoms_available__exact_mass__1") - - self.cursor.execute("""CREATE INDEX mass__1 - ON %s (exact_mass__1)""" % table) - self.cursor.execute("""CREATE INDEX mass__0_0001 - ON %s (exact_mass__0_0001)""" % table) - self.cursor.execute("""CREATE INDEX atoms - ON %s (C, H, N, O, P, S);""" % table) - - if selection != "gen_subs_table": - self.cursor.execute("""CREATE INDEX heavy_atoms__valence__atoms_available__exact_mass__1 - ON %s (heavy_atoms, atoms_available, valence, exact_mass__1);""" % table) - self.cursor.execute("""CREATE INDEX smiles__heavy_atoms__valence__atoms_available__exact_mass__1 - ON %s (smiles, heavy_atoms, atoms_available, valence, exact_mass__1);""" % table) - - def close(self, clean=True): - if clean: - self.cursor.execute("DROP TABLE IF EXISTS unique_hmdbid") - self.cursor.execute("DROP TABLE IF EXISTS filtered_hmdbid_substructures") - self.cursor.execute("DROP TABLE IF EXISTS subset_substructures") + self.cursor.execute("""CREATE INDEX modified_exact_mass__1 + ON %s (modified_exact_mass__1)""" % "substructure_ions") - self.conn.close() + self.cursor.execute("""CREATE INDEX modified_exact_mass__0_0001 + ON %s (modified_exact_mass__0_0001)""" % "substructure_ions") + self.cursor.execute("""CREATE 
INDEX exact_mass__1 + ON %s (exact_mass__1)""" % "substructures") -def get_substructure(mol, idxs_edges_subgraph, isomeric_smiles=False): - """ - Generates information for the substructure database from a reference molecule and the bond IDs of a substructure. + self.cursor.execute("""CREATE INDEX exact_mass__0_0001 + ON %s (exact_mass__0_0001)""" % "substructures") - :param mol: An :py:meth:`rdkit.Chem.Mol` object containing a reference molecule that has been fragmented. + self.cursor.execute("""CREATE INDEX atoms ON %s (C, H, N, O, P, S);""" % "substructures") - :param idxs_edges_subgraph: Either a list of atom indices within the reference molecule that make up the - substructure (as returned by :py:meth:`metaboblend.databases.get_sgs`) or an integer representing the index - of a single atom. + self.cursor.execute("DROP INDEX IF EXISTS heavy_atoms__valence__atoms_available__exact_mass__1") + self.cursor.execute("DROP INDEX IF EXISTS smiles__heavy_atoms__valence__atoms_available__exact_mass__1") - :param isomeric_smiles: If True, returns smiles with non-structural isomeric information. + self.cursor.execute("""CREATE INDEX heavy_atoms__valence__atoms_available__exact_mass__1 + ON %s (heavy_atoms, atoms_available, valence, exact_mass__1)""" % "substructures") - :return: A list of lists containing the libs of the substructures obtained by the query; the lib is a - dictionary containing details about the substructure, in the format: + self.cursor.execute("""CREATE INDEX smiles__heavy_atoms__valence__atoms_available__exact_mass__1 + ON %s (smiles, heavy_atoms, atoms_available, valence, exact_mass__1)""" % "substructures") - * "**smiles**": Substructure smiles string + def create_temp_indexes(self, table_name): + """ Creates indexes for temporary substructure tables. 
""" - * "**mol**": Substructure :py:meth:`rdkit.Chem.Mol` + self.cursor.execute("DROP INDEX IF EXISTS %s_modified_exact_mass__1" % table_name) + self.cursor.execute("DROP INDEX IF EXISTS %s_modified_exact_mass__0_0001" % table_name) + self.cursor.execute("DROP INDEX IF EXISTS %s_atoms" % table_name) + self.cursor.execute("DROP INDEX IF EXISTS %s_exact_mass__1" % table_name) + self.cursor.execute("DROP INDEX IF EXISTS %s_exact_mass__0_0001" % table_name) - * "**bond_types**": The type of bonds to be formed by dummy atoms - see - :py:meth:`metaboblend.build_structures.add_bonds` and :py:meth:`Chem.rdchem.BondType`. Is a dictionary - whose keys are atom indices and values are bond types, as follows: + self.cursor.execute("""CREATE INDEX {}_modified_exact_mass__1 + ON {} (modified_exact_mass__1) + """.format(table_name, table_name + "_substructure_ions")) - * **1.0** Single - * **1.5** Aromatic - * **2.0** Double + self.cursor.execute("""CREATE INDEX {}_modified_exact_mass__0_0001 + ON {} (modified_exact_mass__0_0001) + """.format(table_name, table_name + "_substructure_ions")) - * "**degree_atoms**": A dictionary containing indices of the atoms connected to dummy atoms that can form bonds - during structure generation as keys, and the number of bonds they can form as values. + self.cursor.execute("""CREATE INDEX {}_exact_mass__1 + ON {} (exact_mass__1) + """.format(table_name, table_name + "_substructures")) - * "**valence**": The total number of bonds that can be formed by the substructure - (the product of `degree_atoms` and `atoms_available`). + self.cursor.execute("""CREATE INDEX {}_exact_mass__0_0001 + ON {} (exact_mass__0_0001) + """.format(table_name, table_name + "_substructures")) - * "**atoms_available**": The total number of degree atoms. 
+ self.cursor.execute("""CREATE INDEX {}_atoms ON {} (C, H, N, O, P, S) + """.format(table_name, table_name + "_substructures")) - * "**dummies**": List of the indices of atoms that may be removed to form bonds during structure generation, - represented by `*`. + def close(self): + """ Remove temporary tables from the database and close the connection. """ + + for temporary_table_name in self.temporary_table_names: + self.cursor.execute("DROP TABLE IF EXISTS %s" % temporary_table_name) + + self.temporary_table_names = [] + self.conn.close() + + +def create_substructure_database(hmdb_paths: Union[str, bytes, os.PathLike], + path_substructure_db: Union[str, bytes, os.PathLike], + ha_min: Union[int, None] = None, + ha_max: Union[int, None] = None, + max_degree: Union[int, None] = 6, + max_atoms_available: Union[int, None] = 2, + method: str = "exhaustive", + substructures_only: bool = False, + isomeric_smiles: bool = False) -> None: """ + Creates a substructure database by fragmenting one or more input molecules. Combinations of + substructures in this database are used to build new molecules. Fragmentation is carried out by selecting + connected sets of bonds in the supplied compound(s). Creates the database before calling + 'metaboblend.databases.update_substructure_database' to add substructures for each input molecule. Generates + indexes on the substructure table. - # convert list of bond indices to list of atom indices - if isinstance(idxs_edges_subgraph, int): # small substructure addition - atom_idxs_subgraph = [idxs_edges_subgraph] - else: - atom_idxs_subgraph = [] - for bIdx in idxs_edges_subgraph: - b = mol.GetBondWithIdx(bIdx) - a1 = b.GetBeginAtomIdx() - a2 = b.GetEndAtomIdx() + :param hmdb_paths: The paths of the HMDB XML records detailing molecules to be fragmented.
- if a1 not in atom_idxs_subgraph: - atom_idxs_subgraph.append(a1) - if a2 not in atom_idxs_subgraph: - atom_idxs_subgraph.append(a2) + :param path_substructure_db: The path of the SQLite 3 substructure database to be created. - # identify atoms which will become dummy elements in the final substructure - atoms_to_dummy = [] - for idx in atom_idxs_subgraph: - for atom in mol.GetAtomWithIdx(idx).GetNeighbors(): - if atom.GetIdx() not in atom_idxs_subgraph: - atoms_to_dummy.append(atom.GetIdx()) + :param ha_min: The minimum size (number of heavy atoms) of substructures to be added to the substructure + database. If None, no limit is applied. - mol_edit = Chem.EditableMol(mol) - degree_atoms = {} + :param ha_max: The maximum size (number of heavy atoms) of substructures to be added to the substructure + database. If None, no limit is applied. - for atom in reversed(mol.GetAtoms()): + :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for + building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming + chemical bonds (e.g. single or double bonds). Atoms available are also limited by the extensivity of the + supplied connectivity database. - if atom.GetIdx() in atoms_to_dummy: - mol_edit.ReplaceAtom(atom.GetIdx(), Chem.Atom("*")) + :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We + define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, + 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For + instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have + a total degree of 3. - mol = mol_edit.GetMol() - mol_edit = Chem.EditableMol(mol) + :param method: The method by which to fragment molecules.
Substructures must have an exact substructure match in + the original molecule in order to be considered valid. - for atom in reversed(mol.GetAtoms()): - if atom.GetIdx() not in atom_idxs_subgraph and atom.GetSymbol() != "*": - mol_edit.RemoveAtom(atom.GetIdx()) + * **exhaustive** The default method for substructure generation. Generates all substructures for a molecule + within the size range. See :py:meth:`rdkit.Chem.FindAllSubgraphsOfLengthMToN`. - mol_out = mol_edit.GetMol() + * **RECAP** Generates substructures using the retrosynthetic combinatorial analysis procedure; fragments are + identified that are likely to be useful for drug synthesis. See :py:meth:`rdkit.Chem.RECAP`. - dummies = [atom.GetIdx() for atom in mol_out.GetAtoms() if atom.GetSymbol() == "*"] + * **BRICS** Generates substructures by breaking retrosynthetically interesting chemical substructures; fragments + are identified that are likely to be useful for drug synthesis.. See :py:meth:`rdkit.Chem.BRICS`. - # get bond degrees - for atom in mol_out.GetAtoms(): + :param substructures_only: Whether to generate all tables or only the substructures table. Retains necessary + information for building and reduces database size. - if atom.GetIdx() in dummies: + :param isomeric_smiles: If True, generates a database using smiles with non-structural isomeric information. 
+ """ - for atom_n in atom.GetNeighbors(): + db = SubstructureDb(path_substructure_db) + db.create_compound_database() + db.close() - if atom_n.GetSymbol() == "*": - continue # do not count dummies for valence calculations - elif atom_n.GetIdx() not in degree_atoms: - degree_atoms[atom_n.GetIdx()] = 1 - else: - degree_atoms[atom_n.GetIdx()] += 1 + for hmdb_path in hmdb_paths: + update_substructure_database(hmdb_path=hmdb_path, path_substructure_db=path_substructure_db, ha_min=ha_min, + ha_max=ha_max, method=method, max_atoms_available=max_atoms_available, + max_degree=max_degree, substructures_only=substructures_only, + isomeric_smiles=isomeric_smiles) - # returns the type of the bond as a double (i.e. 1.0 for SINGLE, 1.5 for AROMATIC, 2.0 for DOUBLE) - bond_types = {} + db = SubstructureDb(path_substructure_db) + db.calculate_possible_hydrogenations() + db.create_indexes() + db.close() - for b in mol_out.GetBonds(): - # use bond types to dummy atoms to inform future structure building from compatible substructures - if mol_out.GetAtomWithIdx(b.GetBeginAtomIdx()).GetSymbol() == "*": - if b.GetEndAtomIdx() not in bond_types: - bond_types[b.GetEndAtomIdx()] = [b.GetBondTypeAsDouble()] - else: - bond_types[b.GetEndAtomIdx()].append(b.GetBondTypeAsDouble()) +def update_substructure_database(hmdb_path: Union[str, bytes, os.PathLike, None], + path_substructure_db: Union[str, bytes, os.PathLike], + ha_min: Union[int, None] = None, + ha_max: Union[int, None] = None, + max_atoms_available: Union[int, None] = None, + max_degree: Union[int, None] = None, + method: str = "exhaustive", + substructures_only: bool = False, + records: Union[Sequence[Dict], None] = None, + isomeric_smiles: bool = False) -> None: + """ + Add entries to the substructure database by fragmenting a molecule or set of molecules. Combinations of + substructures in this database are used to build new molecules. Fragmentation is carried out by selecting + connected sets of bonds in the supplied compound(s).
- elif mol_out.GetAtomWithIdx(b.GetEndAtomIdx()).GetSymbol() == "*": - if b.GetBeginAtomIdx() not in bond_types: - bond_types[b.GetBeginAtomIdx()] = [b.GetBondTypeAsDouble()] - else: - bond_types[b.GetBeginAtomIdx()].append(b.GetBondTypeAsDouble()) + :param hmdb_path: The path of the HMDB XML record(s) detailing molecules to be fragmented. Can take HMDB records for + individual metabolites or the entirety of HMDB. Will be overridden by the + `records` parameter, if provided. - try: - mol_out.UpdatePropertyCache() # alternative to Chem.SanitizeMol that updates valence information - except: - return + :param path_substructure_db: The path of the existing SQLite 3 substructure database to be updated. - return {"smiles": Chem.MolToSmiles(mol_out, isomericSmiles=isomeric_smiles), # REORDERED ATOM INDEXES - "mol": mol_out, - "bond_types": bond_types, - "degree_atoms": degree_atoms, - "valence": sum(degree_atoms.values()), - "atoms_available": len(degree_atoms.keys()), - "dummies": dummies} + :param ha_min: The minimum size (number of heavy atoms) of substructures to be added to the substructure + database. If None, no limit is applied. + :param ha_max: The maximum size (number of heavy atoms) of substructures to be added to the substructure + database. If None, no limit is applied. -def get_elements(mol, elements=None): - """ - Gets the elemental composition of a molecule. + :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for + building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming + chemical bonds (e.g. single or double bonds). Atoms available are also limited by the extensivity of the + supplied connectivity database. - :param mol: An :py:meth:`rdkit.Chem.Mol` object containing the molecule of interest. + :param max_degree: The maximum allowable degree of substructures to be considered for building structures.
We + define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, + 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For + instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have + a total degree of 3. - :param elements: A dictionary whose keys are strings representing an element and values are 0. Unspecified, - defaults to `{"C": 0, "H": 0, "N": 0, "O": 0, "P": 0, "S": 0, "*": 0}`. + :param method: The method by which to fragment molecules. Substructures must have an exact substructure match in + the original molecule in order to be considered valid. - :return: The dictionary specified by **elements**, with the number of atoms corresponding to each element as keys. - """ + * **exhaustive** The default method for substructure generation. Generates all substructures for a molecule + within the size range. See :py:meth:`rdkit.Chem.FindAllSubgraphsOfLengthMToN`. - if not elements: - elements = {"C": 0, "H": 0, "N": 0, "O": 0, "P": 0, "S": 0, "*": 0} + * **RECAP** Generates substructures using the retrosynthetic combinatorial analysis procedure; fragments are + identified that are likely to be useful for drug synthesis. See :py:meth:`rdkit.Chem.RECAP`. - mol = Chem.AddHs(mol) - for atom in mol.GetAtoms(): - elements[atom.GetSymbol()] += 1 + * **BRICS** Generates substructures by breaking retrosynthetically interesting chemical substructures; fragments + are identified that are likely to be useful for drug synthesis.. See :py:meth:`rdkit.Chem.BRICS`. - return elements + :param substructures_only: Whether to generate all tables or only the substructures table. Retains necessary + information for building and reduces database size. + :param records: Records of molecules to be fragmented. 
Must be a list containing dictionaries containing key + information about the molecules, as generated by :py:meth:`metaboblend.databases.parse_xml`; if records + is not supplied, the records will be obtained from the XML at `hmdb_path`. -def calculate_exact_mass(mol, exact_mass_elements=None): + :param isomeric_smiles: If True, generates a database using smiles with non-structural isomeric information. """ - Gets the exact mass of a molecule. - :param mol: An :py:meth:`rdkit.Chem.Mol` object containing the molecule of interest. + conn = sqlite3.connect(path_substructure_db) + cursor = conn.cursor() - :param exact_mass_elements: A dictionary whose keys are strings representing an element and values are the exact masses of - each element. Unspecified, defaults to : - `{"C": 12.0, "H": 1.007825, "N": 14.003074, "O": 15.994915, "P": 30.973763, "S": 31.972072, "*": -1.007825}` + if records is None: + records = parse_xml(hmdb_path, reformat=False) - :return: The exact mass of the molecule. - """ + if ha_min is None: + ha_min = 1 - if not exact_mass_elements: - exact_mass_elements = {"C": 12.0, "H": 1.007825, "N": 14.003074, "O": 15.994915, "P": 30.973763, "S": 31.972072, - "*": -1.007825} + if ha_max is None: + ha_max = 9999 + + for record_dict in filter_records(records, isomeric_smiles=isomeric_smiles): + if not substructures_only: + cursor.execute("""INSERT OR IGNORE INTO compounds ( + hmdbid, + exact_mass, + formula, + C, H, N, O, P, S, + smiles) + VALUES ( + :HMDB_ID, + :exact_mass, + :formula, + :C, :H, :N, :O, :P, :S, + :smiles)""", record_dict) + + # Returns a tuple of 2-tuples with bond IDs + for sgs in get_sgs(record_dict=record_dict, n_min=ha_min-1, n_max=ha_max-1, method=method): + for edge_idxs in sgs: + lib = get_substructure(record_dict["mol"], edge_idxs, isomeric_smiles=isomeric_smiles) # convert bond IDs to substructure mol + + # insert substructure obtained from get_sgs + insert_substructure(lib, cursor, record_dict, substructures_only, 
max_atoms_available, max_degree, + isomeric_smiles) + + if ha_min <= 1: + for atom in record_dict["mol"].GetAtoms(): + lib = get_substructure(record_dict["mol"], atom.GetIdx(), isomeric_smiles=isomeric_smiles) + + # insert single atom substructures + insert_substructure(lib, cursor, record_dict, substructures_only, max_atoms_available, max_degree, + isomeric_smiles) + + conn.commit() + conn.close() + + +def insert_substructure(lib, cursor, record_dict, substructures_only, max_atoms_available, max_degree, isomeric_smiles): + """ + Converts the details of a single substructure into an entry in a substructure database. See + :py:meth:`update_substructure_database`. + + :param lib: A dictionary containing details about the substructure, as returned by + :py:meth:`metaboblend.databases.get_substructure`, in the format: + + * "**smiles**": Substructure smiles string + + * "**mol**": Substructure :py:meth:`rdkit.Chem.Mol` + + * "**bond_types**": The type of bonds to be formed by dummy atoms - see + :py:meth:`metaboblend.build_structures.add_bonds` and :py:meth:`Chem.rdchem.BondType`. Is a dictionary + whose keys are atom indices and values are bond types, as follows: + + * **1.0** Single + * **1.5** Aromatic + * **2.0** Double + + * "**degree_atoms**": A dictionary containing indices of the atoms connected to dummy atoms that can form bonds + during structure generation as keys, and the number of bonds they can form as values. + + * "**valence**": The total number of bonds that can be formed by the substructure + (the product of `degree_atoms` and `atoms_available`). + + * "**atoms_available**": The total number of degree atoms. + + * "**dummies**": List of the indices of atoms that may be removed to form bonds during structure generation, + represented by `*`. + + :param cursor: SQLite3 cursor connected to the substructure database. Used to insert substructures. + + :param record_dict: Record of molecule to be fragmented. 
Must be a dictionary containing key + information about the molecule, as generated by :py:meth:`metaboblend.databases.parse_xml`. + + :param substructures_only: Whether to generate all tables or only the substructures table. Retains necessary + information for building and reduces database size. + + :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for + building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming + chemical bonds (e.g. single or double bonds). Atoms available are also limited by the extensivity of the + supplied connectivity database. + + :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We + define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, + 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For + instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have + a total degree of 3. + + :param isomeric_smiles: If True, generates database entries using smiles with non-structural isomeric information. 
+ """ + + if lib is None: + return + + if lib["valence"] == 0: + return + + if max_atoms_available is not None: + if lib["atoms_available"] > max_atoms_available: + return + + if max_degree is not None: + if lib["valence"] > max_degree: + return + + smiles_rdkit = Chem.MolToSmiles(lib["mol"], isomericSmiles=isomeric_smiles) # canonical rdkit smiles + + exact_mass = calculate_exact_mass(lib["mol"]) + els = get_elements(lib["mol"]) + + sub_smi_dict = {'smiles': smiles_rdkit, + 'exact_mass': exact_mass, + 'length': sum([els[atom] for atom in els if atom != "*"]), + "valence": lib["valence"], + "valence_atoms": str(lib["degree_atoms"]), + "atoms_available": lib["atoms_available"], + "mol": lib["mol"].ToBinary(), + "bond_types": str(lib["bond_types"]), + "dummies": str(lib["dummies"])} + + sub_smi_dict["exact_mass__1"] = round(sub_smi_dict["exact_mass"], 0) + sub_smi_dict["exact_mass__0_0001"] = round(sub_smi_dict["exact_mass"], 4) + + sub_smi_dict.update(els) + sub_smi_dict["heavy_atoms"] = sum([els[atom] for atom in els if atom != "H" and atom != "*"]) + + cursor.execute("""INSERT OR IGNORE INTO substructures ( + smiles, + heavy_atoms, + length, + exact_mass__1, + exact_mass__0_0001, + exact_mass, + C, + H, + N, + O, + P, + S, + valence, + valence_atoms, + atoms_available, + bond_types, + dummies, + mol) + values ( + :smiles, + :heavy_atoms, + :length, + :exact_mass__1, + :exact_mass__0_0001, + :exact_mass, + :C, + :H, + :N, + :O, + :P, + :S, + :valence, + :valence_atoms, + :atoms_available, + :bond_types, + :dummies, + :mol)""", sub_smi_dict) + + if not substructures_only: + cursor.execute("SELECT substructure_id FROM substructures WHERE smiles = '%s'" % sub_smi_dict["smiles"]) + + cursor.execute("""INSERT OR IGNORE INTO hmdbid_substructures ( + hmdbid, + substructure_id) + VALUES ('{}', {})""".format(record_dict['HMDB_ID'], cursor.fetchall()[0][0])) + + +def get_substructure(mol, idxs_edges_subgraph, isomeric_smiles=False): + """ + Generates information for the 
substructure database from a reference molecule and the bond IDs of a substructure. + + :param mol: An :py:meth:`rdkit.Chem.Mol` object containing a reference molecule that has been fragmented. + + :param idxs_edges_subgraph: Either a list of bond indices within the reference molecule that make up the + substructure (as returned by :py:meth:`metaboblend.databases.get_sgs`) or an integer representing the index + of a single atom. + + :param isomeric_smiles: If True, returns smiles with non-structural isomeric information. + + :return: None if `UpdatePropertyCache` fails for the substructure; otherwise the lib of the substructure, a + dictionary containing details about the substructure, in the format: + + * "**smiles**": Substructure smiles string + + * "**mol**": Substructure :py:meth:`rdkit.Chem.Mol` + + * "**bond_types**": The type of bonds to be formed by dummy atoms - see + :py:meth:`metaboblend.build_structures.add_bonds` and :py:meth:`Chem.rdchem.BondType`. Is a dictionary + whose keys are atom indices and values are lists of bond types, as follows: + + * **1.0** Single + * **1.5** Aromatic + * **2.0** Double + + * "**degree_atoms**": A dictionary containing indices of the atoms connected to dummy atoms that can form bonds + during structure generation as keys, and the number of bonds they can form as values. + + * "**valence**": The total number of bonds that can be formed by the substructure + (double bonds to dummy atoms count as 2, all other bond types as 1). + + * "**atoms_available**": The total number of degree atoms. + + * "**dummies**": List of the indices of atoms that may be removed to form bonds during structure generation, + represented by `*`.
+ """ + + # convert list of bond indices to list of atom indices + if isinstance(idxs_edges_subgraph, int): # small substructure addition + atom_idxs_subgraph = [idxs_edges_subgraph] + else: + atom_idxs_subgraph = [] + for bIdx in idxs_edges_subgraph: + b = mol.GetBondWithIdx(bIdx) + a1 = b.GetBeginAtomIdx() + a2 = b.GetEndAtomIdx() + + if a1 not in atom_idxs_subgraph: + atom_idxs_subgraph.append(a1) + if a2 not in atom_idxs_subgraph: + atom_idxs_subgraph.append(a2) + + # identify atoms which will become dummy elements in the final substructure + atoms_to_dummy = [] + for idx in atom_idxs_subgraph: + for atom in mol.GetAtomWithIdx(idx).GetNeighbors(): + if atom.GetIdx() not in atom_idxs_subgraph: + atoms_to_dummy.append(atom.GetIdx()) + + mol_edit = Chem.EditableMol(mol) + degree_atoms = {} + + for atom in reversed(mol.GetAtoms()): + + if atom.GetIdx() in atoms_to_dummy: + mol_edit.ReplaceAtom(atom.GetIdx(), Chem.Atom("*")) + + mol = mol_edit.GetMol() + mol_edit = Chem.EditableMol(mol) + + for atom in reversed(mol.GetAtoms()): + if atom.GetIdx() not in atom_idxs_subgraph and atom.GetSymbol() != "*": + mol_edit.RemoveAtom(atom.GetIdx()) + + mol_out = mol_edit.GetMol() + + dummies = [atom.GetIdx() for atom in mol_out.GetAtoms() if atom.GetSymbol() == "*"] + + # get bond degrees + for atom in mol_out.GetAtoms(): + + if atom.GetIdx() in dummies: + + for atom_n in atom.GetNeighbors(): + + if atom_n.GetSymbol() == "*": + continue # do not count dummies for valence calculations + elif atom_n.GetIdx() not in degree_atoms: + degree_atoms[atom_n.GetIdx()] = 1 + else: + degree_atoms[atom_n.GetIdx()] += 1 + + # returns the type of the bond as a double (i.e. 
1.0 for SINGLE, 1.5 for AROMATIC, 2.0 for DOUBLE) + bond_types = {} + + for b in mol_out.GetBonds(): + + begin_atom = mol_out.GetAtomWithIdx(b.GetBeginAtomIdx()) + end_atom = mol_out.GetAtomWithIdx(b.GetEndAtomIdx()) + + # use bond types to dummy atoms to inform future structure building from compatible substructures + if begin_atom.GetSymbol() == "*" and end_atom.GetSymbol() != "*": # do not count dummy-dummy bonds for valence calculations + if b.GetEndAtomIdx() not in bond_types: + bond_types[b.GetEndAtomIdx()] = [b.GetBondTypeAsDouble()] + else: + bond_types[b.GetEndAtomIdx()].append(b.GetBondTypeAsDouble()) + + elif end_atom.GetSymbol() == "*" and begin_atom.GetSymbol() != "*": + if b.GetBeginAtomIdx() not in bond_types: + bond_types[b.GetBeginAtomIdx()] = [b.GetBondTypeAsDouble()] + else: + bond_types[b.GetBeginAtomIdx()].append(b.GetBondTypeAsDouble()) + + try: + mol_out.UpdatePropertyCache() # alternative to Chem.SanitizeMol that updates valence information + except: + return + + valence = 0 + for atom_available in bond_types.values(): + for bond_type in atom_available: + if bond_type == 2: + valence += 2 + else: + valence += 1 + + return {"smiles": Chem.MolToSmiles(mol_out, isomericSmiles=isomeric_smiles), # REORDERED ATOM INDEXES + "mol": mol_out, + "bond_types": bond_types, + "degree_atoms": degree_atoms, + "valence": valence, + "atoms_available": len(degree_atoms.keys()), + "dummies": dummies} + + +def get_elements(mol, elements=None): + """ + Gets the elemental composition of a molecule. + + :param mol: An :py:meth:`rdkit.Chem.Mol` object containing the molecule of interest. + + :param elements: A dictionary whose keys are strings representing an element and values are 0. Unspecified, + defaults to `{"C": 0, "H": 0, "N": 0, "O": 0, "P": 0, "S": 0, "*": 0}`. + + :return: The dictionary specified by **elements**, with the number of atoms corresponding to each element as keys. 
+ """ + + if not elements: + elements = {"C": 0, "H": 0, "N": 0, "O": 0, "P": 0, "S": 0, "*": 0} + + mol = Chem.AddHs(mol) + for atom in mol.GetAtoms(): + + elements[atom.GetSymbol()] += 1 + + return elements + + +def calculate_exact_mass(mol, exact_mass_elements=None): + """ + Gets the exact mass of a molecule. + + :param mol: An :py:meth:`rdkit.Chem.Mol` object containing the molecule of interest. + + :param exact_mass_elements: A dictionary whose keys are strings representing an element and values are the exact masses of + each element. If unspecified, defaults to: + `{"C": 12.0, "H": 1.007825, "N": 14.003074, "O": 15.994915, "P": 30.973763, "S": 31.972072}`; dummy atoms (`*`) are skipped. + + :return: The exact mass of the molecule. + """ + + if not exact_mass_elements: + exact_mass_elements = {"C": 12.0, "H": 1.007825, "N": 14.003074, "O": 15.994915, "P": 30.973763, "S": 31.972072} + + exact_mass = 0.0 + mol = Chem.AddHs(mol) - exact_mass = 0.0 - mol = Chem.AddHs(mol) for atom in mol.GetAtoms(): + atom_symbol = atom.GetSymbol() + if atom_symbol != "*": exact_mass += exact_mass_elements[atom_symbol] @@ -725,565 +1185,180 @@ def filter_records(records, isomeric_smiles=False): def get_substructure_bond_idx(prb_mol, ref_mol): """ Takes a substructure and the original molecule from which it was generated and matches the substructure to its - original position in the reference molecule. Will only find a single solution, whilst multiple are possible. - - :param prb_mol: Substructure (without dummy atoms) as an :py:meth:`rdkit.Chem.Mol` object. - - :param ref_mol: Original molecule for the substructure to be matched in as an :py:meth:`rdkit.Chem.Mol` object. - - :returns: None if there is no match for the **prb_mol** in the **ref_mol**, else returns a tuple of the bond indices - in the **ref_mol* matched by the **prb_mol**.
- """ - - if ref_mol.HasSubstructMatch(prb_mol): - atom_idx = ref_mol.GetSubstructMatch(prb_mol) - else: - return None - - bond_idx = () - for atom in ref_mol.GetAtoms(): - if atom.GetIdx() in atom_idx: - for bond in atom.GetBonds(): - if bond.GetBeginAtomIdx() in atom_idx and bond.GetEndAtomIdx() in atom_idx: - if bond.GetIdx() not in bond_idx: - bond_idx = (*bond_idx, bond.GetIdx()) - - return bond_idx - - -def subset_sgs_sizes(sgs, n_min, n_max): - """ - Some substructure generation methods require that their results be filtered to ensure that they are within the - databvase substructure size requirements. - - :param sgs: A list of lists containing the indices of edges in the original molecule that represent its - substructures. - - :param n_min: The minimum number of bonds (edges) for a valid substructure. - - :param n_max: The maximum number of bonds (edges) for a valid substructure. - - :return: The original sgs list, with those substructures that are not within n_min and n_max (inclusive) removed. - """ - - sgs_new = [] - - for i, edge_idxs in enumerate(sgs): - edge_idxs_new = [] - - for j, bonds in enumerate(edge_idxs): - - if n_max is None: - if n_min <= len(bonds): - edge_idxs_new.append(bonds) - - else: - if n_min <= len(bonds) <= n_max: - edge_idxs_new.append(bonds) - - if len(edge_idxs_new) > 0: - sgs_new.append(edge_idxs_new) - - return sgs_new - - -def get_sgs(record_dict, n_min, n_max, method="exhaustive"): - """ - Generates substructures based on an original molecule, which are described by sets of its bond indices. - - :param record_dict: A dictionary of key information about the original molecule, as generated by - :py:meth:`metaboblend.databases.filter_records`. Includes HMDBID, smiles and the related - :py:meth:`rdkit.Chem.Mol` representation. - - :param n_min: The minimum number of bonds (edges) for a valid substructure. - - :param n_max: The maximum number of bonds (edges) for a valid substructure. 
- - :param method: The method by which to fragment molecules. Substructures must have an exact substructure match in - the original molecule in order to be considered valid. - - * **exhaustive** The default method for substructure generation. Generates all substructures for a molecule - within the size range. See :py:meth:`rdkit.Chem.FindAllSubgraphsOfLengthMToN`. - - * **RECAP** Generates substructures using the retrosynthetic combinatorial analysis procedure; fragments are - identified that are likely to be useful for drug synthesis. See :py:meth:`rdkit.Chem.RECAP`. - - * **BRICS** Generates substructures by breaking retrosynthetically interesting chemical substructures; fragments - are identified that are likely to be useful for drug synthesis.. See :py:meth:`rdkit.Chem.BRICS`. - - :return: A list of lists of bond indices referring to substructures of the original molecule. - """ - - if method == "exhaustive": - - if n_max is None: - n_max = 1000 - - return Chem.FindAllSubgraphsOfLengthMToN(record_dict["mol"], n_min, n_max) - - elif method == "RECAP": - - hierarchy = Recap.RecapDecompose(record_dict["mol"]) - sgs = [] - for substructure in hierarchy.GetAllChildren().values(): - - substructure = Chem.DeleteSubstructs(substructure.mol, Chem.MolFromSmarts('[#0]')) - edge_idxs = get_substructure_bond_idx(substructure, record_dict["mol"]) - if edge_idxs is not None: - sgs.append(edge_idxs) - - return subset_sgs_sizes(sgs=[sgs], n_min=n_min, n_max=n_max) - - elif method == "BRICS": - - substructures = BRICS.BRICSDecompose(record_dict["mol"]) - sgs = [] - for substructure in substructures: - substructure = Chem.DeleteSubstructs(Chem.MolFromSmiles(substructure), Chem.MolFromSmarts('[#0]')) - edge_idxs = get_substructure_bond_idx(substructure, record_dict["mol"]) - - if edge_idxs is not None: - sgs.append(edge_idxs) - - return subset_sgs_sizes(sgs=[sgs], n_min=n_min, n_max=n_max) - - -def create_substructure_database(hmdb_paths: Union[str, bytes, os.PathLike], - 
path_substructure_db: Union[str, bytes, os.PathLike], - ha_min: Union[int, None] = None, - ha_max: Union[int, None] = None, - max_degree: Union[int, None] = None, - max_atoms_available: Union[int, None] = None, - method: str = "exhaustive", - substructures_only: bool = False, - isomeric_smiles: bool = False) -> None: - """ - Creates a substructure database by fragmenting one or more input molecules. Combinations of - substructures in this database are used to build new molecules. Fragmentation is carried out by selecting - connected sets bonds in the supplied compound(s). Creates the database before calling - 'metaboverse.databases.update_substructure_database' to add substructures for each input molecule. Generates - indexes on the substructure table. - - :param hmdb_paths: The paths of the HMDB XML records detailing molecules to be fragmented. - - :param path_substructure_db: The path of the SQLite 3 substructure database to be created. - - :param ha_min: The minimum size (number of heavy atoms) of substructures to be added to the substructure - database. If None, no limit is applied. - - :param ha_max: The maximum size (number of heavy atoms) of substructures to be added to the substructure - database. None, no limit is applied. - - :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for - building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming - chemical bonds (e.g. single or double bonds). Atoms available are also limited by the extensivity of the - supplied connectivity database. - - :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We - define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, - 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. 
For - instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have - a total degree of 3. - - :param method: The method by which to fragment molecules. Substructures must have an exact substructure match in - the original molecule in order to be considered valid. - - * **exhaustive** The default method for substructure generation. Generates all substructures for a molecule - within the size range. See :py:meth:`rdkit.Chem.FindAllSubgraphsOfLengthMToN`. - - * **RECAP** Generates substructures using the retrosynthetic combinatorial analysis procedure; fragments are - identified that are likely to be useful for drug synthesis. See :py:meth:`rdkit.Chem.RECAP`. - - * **BRICS** Generates substructures by breaking retrosynthetically interesting chemical substructures; fragments - are identified that are likely to be useful for drug synthesis.. See :py:meth:`rdkit.Chem.BRICS`. - - :param substructures_only: Whether to generate all tables or only the substructures table. Retains necessary - information for building and reduces database size. - - :param isomeric_smiles: If True, generates a database using smiles with non-structural isomeric information. 
- """ - - db = SubstructureDb(path_substructure_db) - db.create_compound_database() - db.close() - - for hmdb_path in hmdb_paths: - update_substructure_database(hmdb_path=hmdb_path, path_substructure_db=path_substructure_db, ha_min=ha_min, - ha_max=ha_max, method=method, max_atoms_available=max_atoms_available, - max_degree=max_degree, substructures_only=substructures_only, - isomeric_smiles=isomeric_smiles) - - db = SubstructureDb(path_substructure_db) - db.create_indexes() - db.close() - - -def update_substructure_database(hmdb_path: Union[str, bytes, os.PathLike, None], - path_substructure_db: Union[str, bytes, os.PathLike], - ha_min: Union[int, None] = None, - ha_max: Union[int, None] = None, - max_atoms_available: Union[int, None] = None, - max_degree: Union[int, None] = None, - method: str = "exhaustive", - substructures_only: bool = False, - records: Union[Sequence[Dict], None] = None, - isomeric_smiles: bool = False) -> None: - """ - Add entries to the substructure database by fragmenting a molecule or set of molecules. Combinations of - substructures in this database are used to build new molecules. Fragmentation is carried out by selecting - connected sets bonds in the supplied compound(s). - - :param hmdb_path: The path of the HMDB XML record(s) detailing molecules to be fragmented. Can take HMDB records for - individual metabolites or the entirety of HMDB. Will be overriden by - `records` parameter, if provided. + original position in the reference molecule. Will only find a single solution, whilst multiple are possible. - :param path_substructure_db: The path of the existing SQLite 3 substructure database to be updated. + :param prb_mol: Substructure (without dummy atoms) as an :py:meth:`rdkit.Chem.Mol` object. - :param ha_min: The minimum size (number of heavy atoms) of substructures to be added to the substructure - database. If None, no limit is applied. 
+ :param ref_mol: Original molecule for the substructure to be matched in as an :py:meth:`rdkit.Chem.Mol` object. - :param ha_max: The maximum size (number of heavy atoms) of substructures to be added to the substructure - database. None, no limit is applied. + :returns: None if there is no match for the **prb_mol** in the **ref_mol**, else returns a tuple of the bond indices + in the **ref_mol** matched by the **prb_mol**. + """ - :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for - building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming - chemical bonds (e.g. single or double bonds). Atoms available are also limited by the extensivity of the - supplied connectivity database. + if ref_mol.HasSubstructMatch(prb_mol): + atom_idx = ref_mol.GetSubstructMatch(prb_mol) + else: + return None - :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We - define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, - 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For - instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have - a total degree of 3. + bond_idx = () + for atom in ref_mol.GetAtoms(): + if atom.GetIdx() in atom_idx: + for bond in atom.GetBonds(): + if bond.GetBeginAtomIdx() in atom_idx and bond.GetEndAtomIdx() in atom_idx: + if bond.GetIdx() not in bond_idx: + bond_idx = (*bond_idx, bond.GetIdx()) - :param method: The method by which to fragment molecules. Substructures must have an exact substructure match in - the original molecule in order to be considered valid. + return bond_idx - * **exhaustive** The default method for substructure generation. Generates all substructures for a molecule - within the size range. 
See :py:meth:`rdkit.Chem.FindAllSubgraphsOfLengthMToN`. - * **RECAP** Generates substructures using the retrosynthetic combinatorial analysis procedure; fragments are - identified that are likely to be useful for drug synthesis. See :py:meth:`rdkit.Chem.RECAP`. +def subset_sgs_sizes(sgs, n_min, n_max): + """ + Some substructure generation methods require that their results be filtered to ensure that they are within the + database substructure size requirements. - * **BRICS** Generates substructures by breaking retrosynthetically interesting chemical substructures; fragments - are identified that are likely to be useful for drug synthesis.. See :py:meth:`rdkit.Chem.BRICS`. + :param sgs: A list of lists containing the indices of edges in the original molecule that represent its + substructures. - :param substructures_only: Whether to generate all tables or only the substructures table. Retains necessary - information for building and reduces database size. + :param n_min: The minimum number of bonds (edges) for a valid substructure. - :param records: Records of molecules to be fragmented. Must be a list containing dictionaries containing key - information about the molecules, as generated by :py:meth:`metaboblend.databases.parse_xml`; if records - is not supplied, the records will be obtained from the XML at `hmdb_path`. + :param n_max: The maximum number of bonds (edges) for a valid substructure. - :param isomeric_smiles: If True, generates a database using smiles with non-structural isomeric information. + :return: The original sgs list, with those substructures that are not within n_min and n_max (inclusive) removed. 
""" - conn = sqlite3.connect(path_substructure_db) - cursor = conn.cursor() + sgs_new = [] - if records is None: - records = parse_xml(hmdb_path, reformat=False) + for i, edge_idxs in enumerate(sgs): + edge_idxs_new = [] - if ha_min is None: - ha_min = 0 + for j, bonds in enumerate(edge_idxs): - substructure_id = 0 + if n_max is None: + if n_min <= len(bonds): + edge_idxs_new.append(bonds) - for record_dict in filter_records(records, isomeric_smiles=isomeric_smiles): - if not substructures_only: - cursor.execute("""INSERT OR IGNORE INTO compounds ( - hmdbid, - exact_mass, - formula, - C, H, N, O, P, S, - smiles) - VALUES ( - :HMDB_ID, - :exact_mass, - :formula, - :C, :H, :N, :O, :P, :S, - :smiles)""", record_dict) + else: + if n_min <= len(bonds) <= n_max: + edge_idxs_new.append(bonds) - # Returns a tuple of 2-tuples with bond IDs - for sgs in get_sgs(record_dict=record_dict, n_min=ha_min-1, n_max=ha_max-1, method=method): - for edge_idxs in sgs: - lib = get_substructure(record_dict["mol"], edge_idxs, isomeric_smiles=isomeric_smiles) # convert bond IDs to substructure mol + if len(edge_idxs_new) > 0: + sgs_new.append(edge_idxs_new) - # insert substructure obtained from get_sgs - insert_substructure(lib, cursor, record_dict, substructures_only, max_atoms_available, max_degree, - isomeric_smiles) + return sgs_new - if ha_min <= 1: - for atom in record_dict["mol"].GetAtoms(): - lib = get_substructure(record_dict["mol"], atom.GetIdx(), isomeric_smiles=isomeric_smiles) - # insert single atom substructures - insert_substructure(lib, cursor, record_dict, substructures_only, max_atoms_available, max_degree, - isomeric_smiles) +def get_sgs(record_dict, n_min, n_max, method="exhaustive"): + """ + Generates substructures based on an original molecule, which are described by sets of its bond indices. 
- conn.commit() - conn.close() + :param record_dict: A dictionary of key information about the original molecule, as generated by + :py:meth:`metaboblend.databases.filter_records`. Includes HMDBID, smiles and the related + :py:meth:`rdkit.Chem.Mol` representation. + :param n_min: The minimum number of bonds (edges) for a valid substructure. -def insert_substructure(lib, cursor, record_dict, substructures_only, max_atoms_available, max_degree, isomeric_smiles): - """ - Converts the details of a single substructure into an entry in a substructure database. See - :py:meth:`update_substructure_database`. + :param n_max: The maximum number of bonds (edges) for a valid substructure. - :param lib: A dictionary containing details about the substructure, as returned by - :py:meth:`metaboblend.databases.get_substructure`, in the format: + :param method: The method by which to fragment molecules. Substructures must have an exact substructure match in + the original molecule in order to be considered valid. - * "**smiles**": Substructure smiles string + * **exhaustive** The default method for substructure generation. Generates all substructures for a molecule + within the size range. See :py:meth:`rdkit.Chem.FindAllSubgraphsOfLengthMToN`. - * "**mol**": Substructure :py:meth:`rdkit.Chem.Mol` + * **RECAP** Generates substructures using the retrosynthetic combinatorial analysis procedure; fragments are + identified that are likely to be useful for drug synthesis. See :py:meth:`rdkit.Chem.RECAP`. - * "**bond_types**": The type of bonds to be formed by dummy atoms - see - :py:meth:`metaboblend.build_structures.add_bonds` and :py:meth:`Chem.rdchem.BondType`. Is a dictionary - whose keys are atom indices and values are bond types, as follows: + * **BRICS** Generates substructures by breaking retrosynthetically interesting chemical substructures; fragments + are identified that are likely to be useful for drug synthesis.. See :py:meth:`rdkit.Chem.BRICS`. 
- * **1.0** Single - * **1.5** Aromatic - * **2.0** Double + :return: A list of lists of bond indices referring to substructures of the original molecule. + """ - * "**degree_atoms**": A dictionary containing indices of the atoms connected to dummy atoms that can form bonds - during structure generation as keys, and the number of bonds they can form as values. + if method == "exhaustive": - * "**valence**": The total number of bonds that can be formed by the substructure - (the product of `degree_atoms` and `atoms_available`). + if n_max is None: + n_max = 1000 - * "**atoms_available**": The total number of degree atoms. + return Chem.FindAllSubgraphsOfLengthMToN(record_dict["mol"], n_min, n_max) - * "**dummies**": List of the indices of atoms that may be removed to form bonds during structure generation, - represented by `*`. + elif method == "RECAP": - :param cursor: SQLite3 cursor connected to the substructure database. Used to insert substructures. + hierarchy = Recap.RecapDecompose(record_dict["mol"]) + sgs = [] + for substructure in hierarchy.GetAllChildren().values(): - :param record_dict: Record of molecule to be fragmented. Must be a dictionary containing key - information about the molecule, as generated by :py:meth:`metaboblend.databases.parse_xml`. + substructure = Chem.DeleteSubstructs(substructure.mol, Chem.MolFromSmarts('[#0]')) + edge_idxs = get_substructure_bond_idx(substructure, record_dict["mol"]) + if edge_idxs is not None: + sgs.append(edge_idxs) - :param substructures_only: Whether to generate all tables or only the substructures table. Retains necessary - information for building and reduces database size. + return subset_sgs_sizes(sgs=[sgs], n_min=n_min, n_max=n_max) - :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for - building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming - chemical bonds (e.g. single or double bonds). 
Atoms available are also limited by the extensivity of the - supplied connectivity database. + elif method == "BRICS": - :param max_degree: The maximum allowable degree of substructures to be considered for building structures. We - define degree as the product of `atoms_available` and the degree of their bonds (bond types, where 1 = single, - 2 = double, etc.). Maximum degree is also limited by the extensivity of the supplied connectivity database. For - instance, a substructure that has 3 `atoms_available`, each of their bond types being single bonds, would have - a total degree of 3. + substructures = BRICS.BRICSDecompose(record_dict["mol"]) + sgs = [] + for substructure in substructures: + substructure = Chem.DeleteSubstructs(Chem.MolFromSmiles(substructure), Chem.MolFromSmarts('[#0]')) + edge_idxs = get_substructure_bond_idx(substructure, record_dict["mol"]) - :param isomeric_smiles: If True, generates database entries using smiles with non-structural isomeric information. - """ + if edge_idxs is not None: + sgs.append(edge_idxs) - if lib is None: - return + return subset_sgs_sizes(sgs=[sgs], n_min=n_min, n_max=n_max) - if max_atoms_available is not None: - if lib["atoms_available"] > max_atoms_available: - return - if max_degree is not None: - if lib["valence"] > max_degree: - return +def calculate_hydrogen_rearrangements(fragment_ions, ion_mode): + """ + Calculate MS-FINDER re-arrangement possibilities. - smiles_rdkit = Chem.MolToSmiles(lib["mol"], isomericSmiles=isomeric_smiles) # canonical rdkit smiles + :param fragment_ions: A sequence of (element symbol, double bond flag) pairs, one per assumed ionised atom. - exact_mass = calculate_exact_mass(lib["mol"]) - els = get_elements(lib["mol"]) + :param ion_mode: The ion mode; "+" selects the positive ion mode rules and "-" the negative ion mode rules. 
- sub_smi_dict = {'smiles': smiles_rdkit, - 'exact_mass': exact_mass, - 'length': sum([els[atom] for atom in els if atom != "*"]), - "valence": lib["valence"], - "valence_atoms": str(lib["degree_atoms"]), - "atoms_available": lib["atoms_available"], - "mol": lib["mol"].ToBinary(), - "bond_types": str(lib["bond_types"]), - "dummies": str(lib["dummies"])} + :return: A set of integers referring to likely hydrogenation modifiers; i.e. if the set contains 1, this means that + the substructure could have one less hydrogen than expected. + """ - sub_smi_dict["exact_mass__1"] = round(sub_smi_dict["exact_mass"], 0) - sub_smi_dict["exact_mass__0_0001"] = round(sub_smi_dict["exact_mass"], 4) + which_rule = {"+": {True: {"C": ["P1"], "N": ["P2"], "O": ["P2"], "P": ["P1", "P2"], "S": ["P1", "P2"]}, + False: {"C": ["P3", "P4"], "N": ["P3", "P4"], "O": ["P3", "P4"], "P": ["P3", "P4"], "S": ["P3", "P4"]}}, + "-": {True: {"C": ["N1", "N2"], "N": ["N1"], "O": ["N1"], "P": ["N2"], "S": ["N1", "N3"]}, + False: {"C": ["N4", "N5"], "N": ["N4", "N5"], "O": ["N4", "N5"], "P": ["N4", "N5"], "S": ["N4", "N5"]}} + } - sub_smi_dict.update(els) - sub_smi_dict["heavy_atoms"] = sum([els[atom] for atom in els if atom != "H" and atom != "*"]) + fragment_ion_rules = [] + for i, fragment_ion in enumerate(fragment_ions): - cursor.execute("""INSERT OR IGNORE INTO substructures ( - smiles, - heavy_atoms, - length, - exact_mass__1, - exact_mass__0_0001, - exact_mass, - C, - H, - N, - O, - P, - S, - valence, - valence_atoms, - atoms_available, - bond_types, - dummies, - mol) - values ( - :smiles, - :heavy_atoms, - :length, - :exact_mass__1, - :exact_mass__0_0001, - :exact_mass, - :C, - :H, - :N, - :O, - :P, - :S, - :valence, - :valence_atoms, - :atoms_available, - :bond_types, - :dummies, - :mol)""", sub_smi_dict) + fragment_ion_rules.append(which_rule[ion_mode][i == 0][fragment_ion[0]]) - if not substructures_only: - cursor.execute("SELECT substructure_id FROM substructures WHERE smiles = '%s'" % 
sub_smi_dict["smiles"]) + if fragment_ion[1]: # double bond (i.e. two hydrogens displaced) + fragment_ion_rules.append(which_rule[ion_mode][False][fragment_ion[0]]) - cursor.execute("""INSERT OR IGNORE INTO hmdbid_substructures ( - hmdbid, - substructure_id) - VALUES ('{}', {})""".format(record_dict['HMDB_ID'], cursor.fetchall()[0][0])) + return get_hydrogenation_modifiers(fragment_ion_rules) -def create_connectivity_database(path_connectivity_db: Union[str, bytes, os.PathLike], max_n_substructures: int = 3, - max_atoms_available: int = 2, path_ri: Union[str, bytes, os.PathLike, None] = None - ) -> None: +def get_hydrogenation_modifiers(rules_list): """ - Generates a connectivity database containing sets of possible combinations of substructures; these combinations are - represented by graphs whose vertices correspond to substructures and edges to bonds. We use geng, part of the nauty - package, along with RI3.6 to ensure that the generated graphs are non-isomorphic - i.e. we only generate each - combination of substructures once. These graphs are pickled in order to be stored in the final column of the - SQLite 3 connectivity database. + Convert rule names to hydrogen modifiers. - :param path_connectivity_db: The path at which to generate the SQLite 3 database. + :param rules_list: A list of lists of rules. E.g. `[["P1"], ["P3", "P4"]]`. - :param max_n_substructures: The maximal number of substructures (vertices). At least two substructures must be - available for bonding for a graph to be created. + :return: A set of integers referring to likely hydrogenation modifiers; i.e. if the set contains 1, this means that + the substructure could have one less hydrogen than expected. If the example above (`[["P1"], ["P3", "P4"]]`) + was given as input, the output would be a set containing + `sum([rule_hydrogenations["P1"], rule_hydrogenations["P3"]])` and + `sum([rule_hydrogenations["P1"], rule_hydrogenations["P4"]])`. 
+ """ - :param max_atoms_available: The maximum number of atoms available of each substructure to be considered for - building molecules. `atoms_available` refers to the number of atoms on a substructure involved in forming - chemical bonds (e.g. single or double bonds). + # the rules modified to account for the fact we are using the neutralised peak mass + rule_hydrogenations = {"P1": -1, "P2": +1, "P3": +1, "P4": -1, + "N1": +1, "N2": -1, "N3": +0, "N4": +1, "N5": -1} - :param path_ri: The path of RI, a required tool for verifying subgraph isomorphism. - """ - - conn = sqlite3.connect(path_connectivity_db) - cursor = conn.cursor() + possible_hydrogenations = set() + for rule_set in itertools.product(*rules_list): - cursor.execute("""DROP TABLE IF EXISTS subgraphs""") - cursor.execute("""CREATE TABLE subgraphs ( - id_pkl INTEGER, - n_graphs INTEGER, - graph6 TEXT, - k INTEGER, - k_partite TEXT, - k_valences TEXT, - nodes_valences TEXT, - n_nodes INTEGER, - n_edges INTEGER, - root BLOB, - PRIMARY KEY (graph6, k_partite, nodes_valences) - );""") - conn.commit() + # note that we don't consider dummy atoms ("*") to have mass, so we do not have to account for these + # in the hydrogenation modifier (as MS-FINDER do) + possible_hydrogenations.add(sum([rule_hydrogenations[rule] for rule in rule_set])) - id_pkl = 0 - - for g, p in calculate_complete_multipartite_graphs(max_atoms_available, max_n_substructures): - - # get complete set of non-isomorphic graphs, using geng, from a distinct multipartite graph as input - proc = subprocess.Popen(["geng", str(len(g.nodes)), "-d1", "-D2", "-q"], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) # max valence for single atom of 2 - geng_out, err = proc.communicate() - - proc.stdout.close() - proc.stderr.close() - - # pipe geng output to RI to generate mappings (complete set of non-isomorphic configurations) - for i, line_geng in enumerate(geng_out.split()): - - s_g = nx.read_graph6(io.BytesIO(line_geng)) - - k_gfu = 
tempfile.NamedTemporaryFile(mode="w", delete=False) - k_gfu.write(graph_to_ri(g, "k_graph")) - k_gfu.seek(0) - - s_gfu = tempfile.NamedTemporaryFile(mode="w", delete=False) - s_gfu.write(graph_to_ri(s_g, "subgraph")) - s_gfu.seek(0) - - proc = subprocess.Popen([path_ri, "mono", "geu", k_gfu.name, s_gfu.name], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) # TODO: add ri as dependency - ri_out, err = proc.communicate() - - k_gfu.close() - s_gfu.close() - - mappings = [] - subgraphs = {} - - for line in ri_out.decode("utf-8").splitlines(): - if line[0] == "{": - mappings.append(eval(line)) - - if len(mappings) > 0: - gi = graph_info(p, s_g, mappings, ) # convert mappings to valence/connectivity specifications - - for vn in gi: - if vn not in subgraphs: - subgraphs[vn] = gi[vn] - - else: - for es in gi[vn]: - if es not in subgraphs[vn]: - subgraphs[vn].append(es) - - if len(subgraphs) > 0: - for vn in subgraphs: # for each valence configuration - subgraphs[vn] = sort_subgraphs(subgraphs[vn]) # sort to remove duplicate configurations - root = {} # graph to be pickled - - for fr in subgraphs[vn]: - parent = root - for e in fr: - parent = parent.setdefault(e, {}) - - vt = tuple([sum(v) for v in eval(vn)]) - - id_pkl += 1 - cursor.execute("""INSERT INTO subgraphs ( - id_pkl, - n_graphs, - graph6, - k, - k_partite, - k_valences, - nodes_valences, - n_nodes, n_edges, - root) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", ( - id_pkl, - len(subgraphs[vn]), - line_geng, - len(p), - str(p), - str(vt), - str(vn), - s_g.number_of_nodes(), - s_g.number_of_edges(), - pickle.dumps(root))) - - conn.commit() - conn.close() + return possible_hydrogenations diff --git a/metaboblend/parse.py b/metaboblend/parse.py index 5495a58..c3260f9 100644 --- a/metaboblend/parse.py +++ b/metaboblend/parse.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2019-2020 Ralf Weber +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. 
# @@ -70,7 +70,7 @@ def parse_ms_data(ms_data, msn=True): yield from parse_msp(ms_data) -def precursor_ion_to_neutral_mass(mass, precursor_type): +def precursor_ion_to_neutral_mass(mass, precursor_type, get_mode=False): """ Convert precursor ion to predicted neutral mass for substructure searching. @@ -78,6 +78,8 @@ def precursor_ion_to_neutral_mass(mass, precursor_type): :param precursor_type: Type of precursor ion. + :param get_mode: If True, also return the ion mode (positive or negative). + :return: Neutral mass. """ @@ -91,7 +93,13 @@ def precursor_ion_to_neutral_mass(mass, precursor_type): "[M+K-2H]-": 36.948605, "[M+Hac-H]-": 59.013853} - return mass - precursor_dict[precursor_type] + precursor_mode = {"[M+H]+": "+", "[M+Na]+": "+", "[M+K]+": "+", + "[M-H]-": "-", "[M+Cl]-": "-", "[M+Na-2H]-": "-", "[M+K-2H]-": "-", "[M+Hac-H]-": "-"} + + if get_mode: + return mass - precursor_dict[precursor_type], precursor_mode[precursor_type] + else: + return mass - precursor_dict[precursor_type] def precursor_ions_to_neutral_masses(ms_dict, which="both"): @@ -108,8 +116,9 @@ def precursor_ions_to_neutral_masses(ms_dict, which="both"): """ if which == "precursor" or which == "both": - ms_dict["exact_mass"] = precursor_ion_to_neutral_mass(ms_dict["precursor_mz"], - ms_dict["precursor_type"]) + ms_dict["exact_mass"], ms_dict["ion_mode"] = precursor_ion_to_neutral_mass(ms_dict["precursor_mz"], + ms_dict["precursor_type"], + get_mode=True) if which == "fragments" or which == "both": @@ -170,7 +179,7 @@ def parse_msp(msp_path): re_query = re.search(meta_re, line, re.IGNORECASE) - if re_query: # TODO: walrus + if re_query: entry_dict[meta_type] = re_query.group(1).strip() if re.match("^Num Peaks(.*)$", line, re.IGNORECASE) or re.match("^PEAK:(.*)", line, re.IGNORECASE): diff --git a/metaboblend/results.py b/metaboblend/results.py deleted file mode 100644 index 898443f..0000000 --- a/metaboblend/results.py +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/env python -# -*- coding: 
utf-8 -*- -# -# Copyright © 2019-2020 Ralf Weber -# -# This file is part of MetaboBlend. -# -# MetaboBlend is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# MetaboBlend is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with MetaboBlend. If not, see . -# - -import os -import csv -import sqlite3 - - -class ResultsDb: - """ - Methods for interacting with the SQLITE3 results database, as created by - :py:meth:`metaboblend.build_structures.annotate_msn`. - - :param path_results: Directory to which results will be written. - """ - - def __init__(self, path_results, msn=True): - """Constructor method.""" - - self.path_results = path_results - self.path_results_db = os.path.join(self.path_results, "metaboblend_results.sqlite") - self.msn = msn - - self.conn = None - self.cursor = None - - self.substructure_combo_id = 0 - - def connect(self): - """ Connects to the results database. """ - - self.conn = sqlite3.connect(self.path_results_db) - self.cursor = self.conn.cursor() - - def create_results_db(self): - """ Generates a new results database. 
""" - - if os.path.exists(self.path_results_db): - os.remove(self.path_results_db) - - self.connect() - - self.cursor.execute("""CREATE TABLE queries ( - ms_id_num INTEGER PRIMARY KEY, - ms_id TEXT, - exact_mass NUMERIC, - C INTEGER, - H INTEGER, - N INTEGER, - O INTEGER, - P INTEGER, - S INTEGER, - ppm INTEGER, - ha_min INTEGER, - ha_max INTEGER, - max_atoms_available INTEGER, - max_degree INTEGER, - max_n_substructures INTEGER, - hydrogenation_allowance INTEGER, - isomeric_smiles INTEGER)""") - - if self.msn: - self.cursor.execute("""CREATE TABLE spectra ( - ms_id_num INTEGER, - fragment_id INTEGER, - neutral_mass NUMERIC, - PRIMARY KEY (ms_id_num, fragment_id))""") - - self.cursor.execute("""CREATE TABLE structures ( - ms_id_num INTEGER, - structure_smiles TEXT, - frequency INTEGER, - PRIMARY KEY (ms_id_num, structure_smiles))""") - - self.cursor.execute("""CREATE TABLE substructures ( - substructure_combo_id INTEGER, - substructure_position_id INTEGER, - ms_id_num INTEGER, - structure_smiles TEXT, - fragment_id INTEGER, - substructure_smiles TEXT, - bde INTEGER, - PRIMARY KEY (substructure_combo_id, substructure_position_id))""") - - self.cursor.execute("""CREATE TABLE results ( - ms_id_num INTEGER, - fragment_id INTEGER, - structure_smiles TEXT, - bde INTEGER, - PRIMARY KEY(ms_id_num, fragment_id, structure_smiles))""") - - self.conn.commit() - - def add_ms(self, msn_data, ms_id, ms_id_num, parameters): - """ - Add entries to the `queries` and `spectra` tables. - - :param msn_data: Dictionary in the form - `msn_data[id] = {mf: [C, H, N, O, P, S], exact_mass: float, fragment_masses: []}`. id represents a unique - identifier for a given spectral tree or fragmentation spectrum, mf is a list of integers referring to the - molecular formula of the structure of interest, exact_mass is the mass of this molecular formula to >=4d.p. - and fragment_masses are neutral fragment masses generated by this structure used to inform candidate - scoring. 
See :py:meth:`metaboblend.build_structures.annotate_msn`. - - :param ms_id: Unique identifier for the annotation of a single metabolite. - - :param ms_id_num: Unique numeric identifier for the annotation of a single metaoblite. - - :param parameters: List of parameters, in the form: [ppm, ha_min, ha_max, max_atoms_available, max_degree, - max_n_substructures, hydrogenation_allowance, isomeric_smiles]. See - :py:meth:`metaboblend.build_structures.annotate_msn`. - """ - - for i, parameter in enumerate(parameters): - if parameter is None: - parameters[i] = "NULL" - elif isinstance(parameter, bool): - parameters[i] = int(parameter) - - self.cursor.execute("""INSERT INTO queries ( - ms_id, - ms_id_num, - exact_mass, - C, H, N, O, P, S, - ppm, - ha_min, - ha_max, - max_atoms_available, - max_degree, - max_n_substructures, - hydrogenation_allowance, - isomeric_smiles - ) VALUES ('{}', {}, {}, '{}', '{}', '{}', '{}', '{}', '{}', {})""".format( - ms_id, - ms_id_num, - msn_data[ms_id]["exact_mass"], - msn_data[ms_id]["mf"][0], msn_data[ms_id]["mf"][1], - msn_data[ms_id]["mf"][2], msn_data[ms_id]["mf"][3], - msn_data[ms_id]["mf"][4], msn_data[ms_id]["mf"][5], - ", ".join([str(p) for p in parameters]) - )) - - self.conn.commit() - - def add_results(self, ms_id_num, smi_dict, fragment_mass=None, fragment_id=None, retain_substructures=False): - """ - Record which smiles were generated for a given fragment mass. - - :param ms_id_num: Unique identifier for the annotation of a single metabolite. - - :param smi_dict: The fragment and substructure smiles generated by the annotation of a single peak for a single - metabolite. - - :param fragment_mass: The neutral fragment mass that has been annotated. - - :param fragment_id: The unique identifier for the fragment mass that has been annotated. - - :param retain_substructures: If True, record substructures in the results DB. 
- """ - - if self.msn: - self.cursor.execute("""INSERT OR IGNORE INTO spectra ( - ms_id_num, - fragment_id, - neutral_mass - ) VALUES ('{}', {}, {})""".format( - ms_id_num, - fragment_id, - fragment_mass - )) - else: - fragment_id = "NULL" - - for structure_smiles in smi_dict.keys(): - - self.cursor.execute("""INSERT OR IGNORE INTO results ( - ms_id_num, - fragment_id, - structure_smiles, - bde - ) VALUES ({}, {}, '{}', {})""".format( - ms_id_num, - fragment_id, - structure_smiles, - min(smi_dict[structure_smiles]["bdes"]) - )) - - if retain_substructures: - for i in range(len(smi_dict[structure_smiles]["substructures"])): # for each combination - - for j, substructure in enumerate(smi_dict[structure_smiles]["substructures"][i]): - self.cursor.execute("""INSERT INTO substructures ( - substructure_combo_id, - substructure_position_id, - ms_id_num, - fragment_id, - structure_smiles, - substructure_smiles, - bde - ) VALUES ({}, {}, {}, {}, '{}', '{}', {})""".format( - self.substructure_combo_id, - j, - ms_id_num, - fragment_id, - structure_smiles, - substructure, - smi_dict[structure_smiles]["bdes"][i] - )) - - self.substructure_combo_id += 1 - - self.conn.commit() - - def calculate_frequencies(self, ms_id_num): - """ - Calculates structure frequencies in the SQLite DB. - - :param ms_id_num: Unique identifier for the annotation of a single metabolite. - """ - - self.cursor.execute("""INSERT INTO structures (ms_id_num, structure_smiles, frequency) - SELECT ms_id_num, structure_smiles, COUNT(*) - FROM results - WHERE ms_id_num = {} - GROUP BY structure_smiles""".format(ms_id_num)) - - def get_structures(self, ms_id_num): - """ - Gets smiles of generated structures. In the case of the MSn annotation workflow, also gets structure - frequencies. - - :param ms_id_num: Unique identifier for the annotation of a single metabolite. - - :return: In the case of simple structure generation, returns a set of smiles strings for output structures. 
- For the MSn annotation workflow, returns a dictionary with smiles as keys and the number of peaks for which - the smiles were generated as values. - """ - - if self.msn: - msn_str = ", frequency" - else: - msn_str = "" - - self.cursor.execute("""SELECT structure_smiles{} FROM structures - WHERE ms_id_num = {} - """.format(msn_str, ms_id_num)) - - if self.msn: - return [t for t in self.cursor.fetchall()] - else: - return [item for t in self.cursor.fetchall() for item in t] - - def generate_csv_output(self): - """ Generate CSV file output for i) queries and tool parameters and ii) structures generated. """ - - with open(os.path.join(self.path_results, "metaboblend_queries.csv"), "w", newline="") as results_file, \ - open(os.path.join(self.path_results, "metaboblend_structures.csv"), "w", newline="") as ms_file: - - results_writer = csv.writer(results_file, delimiter=",") - ms_writer = csv.writer(ms_file, delimiter=",") - - results_writer.writerow(["ms_id", "exact_mass", "C", "H", "N", "O", "P", "S", "ppm", "ha_min", "ha_max", - "max_atoms_available", "max_degree", "max_n_substructures", - "hydrogenation_allowance", "isomeric_smiles"]) - - self.cursor.execute("SELECT * FROM queries") - - for query in self.cursor.fetchall(): - results_writer.writerow(query) - - ms_writer.writerow(["ms_id", "smiles", "frequency", "exact_mass", "C", "H", "N", "O", "P", "S"]) - - self.cursor.execute("SELECT * FROM structures") - - for structure in self.cursor.fetchall(): - ms_writer.writerow(structure) - - def close(self): - """ Close the connection to the SQLITE3 database. 
""" - - self.conn.close() diff --git a/notebooks/workflow.ipynb b/notebooks/workflow.ipynb new file mode 100644 index 0000000..6c7db06 --- /dev/null +++ b/notebooks/workflow.ipynb @@ -0,0 +1,224 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# MetaboBlend\n", + "\n", + "## Example data\n", + "\n", + "For this example, we load D-Glucose (HMDB0000122) before generating a database of its\n", + "substructures; MetaboBlend leverages substructures of known endogenous compounds to\n", + "generate candidate structures for a given molecular composition and exact mass. We\n", + "can additionally annotate MSn spectra in order to rank candidate lists." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "'C:\\\\Users\\\\jackg\\\\OneDrive\\\\Documents\\\\Work\\\\Final_Year_2019-2020\\\\LCMS_Dissertation\\\\Metaboverse\\\\metaboblend\\\\notebooks\\\\notebook_data'" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# standard imports\n", + "import os\n", + "import shutil\n", + "\n", + "# rdkit - manipulation of molecules\n", + "from rdkit import Chem\n", + "from rdkit.Chem.Draw import IPythonConsole\n", + "IPythonConsole.ipython_useSVG=True\n", + "\n", + "# metaboblend imports\n", + "from metaboblend.build_structures.annotate import generate_structures, annotate_msn\n", + "\n", + "# extract test data\n", + "test_data = os.path.realpath(os.path.join(\"..\", \"tests\", \"test_data\"))\n", + "notebook_data = os.path.realpath(os.path.join(\"notebook_data\"))\n", + "\n", + "shutil.copytree(test_data, notebook_data)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Database generation\n", + "\n", + "MetaboBlend requires a connectivity and substructure database in order to\n", + "propose 
candidate structures. Combinations of substructures are considered\n", + "by MetaboBlend to propose candidate structures for a metabolite of interest.\n", + "A standard connectivity database is bundled with the package and a small\n", + "substructure database has been pre-generated for this example." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Standard structure generation\n", + "\n", + "Candidate structures can be generated for a given molecular formula and\n", + "exact mass from the generated databases. Since this method doesn't use\n", + "MS/MS information, many unique structures can be generated and candidates\n", + "are not assigned a relative score.\n", + "\n", + "The function yields a set of smiles for each compound in ms_data\n", + "(in this case, only HMDB0000122). After extracting smiles, RDKit can be\n", + "used to visualise generated structures." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "data": { + "text/plain": "", + "image/svg+xml": "\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nOH\nO\nOH\nOH\nOH\nOH\n\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAAX5ElEQVR4nO3deVRTd/oG8CdhCQiyJYMFqkUguABWsGpNKaNWseBudVzaAyiWulUZ64Zi3bCKoxa1HZeW49JxbK0dl/Ek49ZqrYg/FWXzjE1AUQuVEUQRAoTk/v4IBQUXBJJvbvJ+DsdjbgL3yVGe8958b3IFHMeBEEJISwlZByCEEH6jGiWEkFahGiUEUKkgEDR8Ndre9JGEPIZqlFg8hQJSKeRycBw4DnI5BAKoVKxjEd6gGiUWLykJcjkiIupuRkQgJQXR0UwzET4R0Eo9sWgqFaRSNPot0G9UKgE0vvepjyeWzZp1AEKYUiohkzXe6OdXd5dUCoBeDCXPRwf1hLyI/jVT/Zd+RCXkMVSjxLJJpUhLe+ZdhDQD1SixbH5+kMmgUDyxcdMmyGR1h/aEvAjVKLF4iYmIjGxoUoUC8fHYvZtpJsIntMRELF5EBDjuiXUkWognL4NOeCLkD3l56NoVIhHu3IGLC+s0hDfooJ6QPyQno7YWEyZQh5KXQtMoIQCAO3fg6wutFteuwd+fdRrCJzSNEgIAWLcONTWYMIE6lLwsmkYJAe7eRefOqKpCVhYCA1mnITxD0yghwIYNUKsxejR1KGkBmkaJxSsthbc3ystx8SLeeIN1GsI/NI0Si5eSgvJyREZSh5KWoWmUWLaHD+Htjfv3cfYsQkNZpyG8RNMosWxffIH79zFwIHUoaTGaRokFq6xE584oLsapUxg4kHUawlc0jRILtm0biovRty91KGkNmkaJpaquhq8vfvsNR49i6FDWaQiP0TRKLFT6d9/ViEQIDkZkJOsshN9oGiWWSKPR+Pv737l168KhQyHDh7OOQ/iNplFiif7xj3/cvHlT2qVLTzqcJ61GNUosjlarTU5OBrBkyRKhkH4FSGvR/yFicfbv33/9+nVfX9/x48ezzkLMAdUosSwcx61ZswZAQkKCtTVdRIe0AVpiIpbl4MGDY8aM6dixo0qlsrW1ZR2HmAOaRoll0Y+iCxYsoA4lbYWmUWJBFApFZGRkhw4dbty4YW9vzzoOMRM0jRIL8tlnnwGYN28edShpQzSNEktx+vTpAQMGiMXiGzdutG/fnnUcYj5oGiWWYvXq1QDi4+OpQ0nbommUWIQLFy68+eabTk5OBQUFLnQZetKmaBol5o/juBUrVgCYPXs2dShpczSNEuNSqSCVNtxs3X8/tVp9v4mioqLCwsLHt/zvf/9zcHCoqqq6c+eORCJp7VMg5En0Lg5iRAoFIiMhlyMiou6mQAClEn5+TzysqgolJSgtbfjz3j2Ulj6+5X0Pj+9//lmj0TRzz1qttrq6+scff/zLX/7S1s+KWDqaRokRvfUWEhPrOlRv0ybs34+jRzF2bENpVla+8CdFv/XWnnPn7OzsXJ/G09PTw8Oj/qa7u3tqauq0adPEYnFOTs4rr7xiwOdILA/VKDEW/eF8o/9v+o05OQgMbNhoZwc3N7i5QSyu+7P+64+NlRKJtatr89+JxHHc0KFDFQrF8OHDjxw50nbPihCqUWI0CgWSknDuXOPtAgHkctjZwdW1rigdHAyx/8LCwsDAwPv37+/cuTMmJsYQuyCWiVbqWVOpIBA0fDXa3vSR5mrAAPTsiY4dDdShADw9PVNSUgDEx8ffunXLQHshFohqlCmFAlIp5HJwHDgOcjkEAqhUrGMZhlSKtLRn3mUUUVFR77333oMHD6ZMmULHYaStUI0ylZTUsGwNICICKSmIjmaayWD8/CCTQaF4YuOmTZDJGq/UG9LWrVs7dOhw6tSpbdu2GW2nxLzRa6PsPGfJRakE0Pjepz6eX5qe8BQZ+ZQTngzs8OHDo0aNcnBwuHLlitRYgzAxYzSNsqNUQiZrvFFfKPoaBZ542ZTvv/Dl5fjPf5Cfj8jIumcUGQmOM3KHAhg5cuTEiRMrKipiYmK
0Wq2R907MD9WoadO/Zqr/qu9Wntq6FZs3Izb2iSfFyN///vdXX301LS1Nv+hESGtQjbJjAksuxlNVBX1hLVrEOgoAuLi4pKamCgSCJUuW5Obmso5D+I1qlB3TWHIxku3bUVSEPn0QHs46Sp3w8PApU6ZUV1dHRUU1/02lhDRFNcpUYiIiIxuaVKFAfDx272aayQA0GmzcCABLlrCO8oSNGze+9tprGRkZ+svWE9IytFJvAh4/qb7+n6N+Xb6mBrdvw9eXxyv1O3bgo4/w+uu4csXU3kHw448/Dho0yMrKKj09vVevXqzjEF6iadQEPHXJxc8PHIfffkPfvhgyBBUVdVt4R6vF+vUAsHixqXUogIEDB86cObO2tjYqKqqqqop1HMJLVKOmrUMHWFsjLw8LFrCO0lJ790KpRNeuGDuWdZSnS05O9vf3v3bt2qpVq1hnIbxEB/Um79o19OqF6mrI5Xj3XdZpXpJOhx49kJuL3bsRFcU6zTOdP3/+7bffFggEv/zyS9++fVnHITxD06jJ694dS5eC4zB1Ku7fZ53mJR04gNxc+Phg0iTWUZ6nX79+n3zySW1tbXR0tFqtZh2H8AzVKB8sWoTQUPz2G+bOZR3lZXAc9CvgCxfC2tSvs7By5cqgoKDr168vMbHTCYjpo4N6nrh+HcHBUKvxww8YM4Z1muY5cgQjR+LVV6FSQSRinebFrly50rdvX61We+rUqf79+7OOQ3iDplGe6NIFn30GANOmobiYdZrm0Y+i8+bxokMBBAcHL168WKfTTZ48uby8nHUcwhs0jfKHTod33sHp0xg9Gv/6F+s0L3L8OIYMgbs7btxAu3as0zRXbW2tTCa7ePHiRx99RJ+kR5qJplH+EAqxcyfat8fBg/juO9ZpXuCbffuq/Pwwdy6POhSAtbX17t277ezsduzYoWj0Pl1CnoGmUb7Zvh3TpkEsRk4OTPUKl2fPng0LC5OIxbfz8+2cnFjHeWnJycmLFi3y9PTMyclxdXVlHYeYOppG+SYuDhERKCnBhx+yjvJMq1evBjDr44/52KEA5s+f//bbbxcWFsbHxxtjf3Q9Lp6jGuUbgQBffw03Nxw9il27WKd5ioyMjOPHjzs5Oc2ePZt1lhYSCoW7du1ydHTcs2fPDz/80PY7qK0FAIUChw5Z1vW4zBTVKA95eiIlRScUbvn229u3b7NO09jKlSs5jps5cyavD4d9fHz0M/X06dOL2+TUCLUaX30FACdPYsgQAFCpcPKkZV2Py1xxhJ+WTZsGYPDgwTqdjnWWBjk5OUKhsF27dnfv3mWdpbV0Ol14eDiAUaNGteT7s7K44mKO47h33uFu3eIqKzmRiOM4LiOD69mT4zhu715u2DCu6e+gUskBnFJZ95emdxETQ9MoX81atapDhw4nTpzYunUr6ywNVq1apdPp4uLi3N3dWWdpLYFAsGPHDicnp0OHDu3bt+/5D9ZoNCUlJQAOHjz4zTffAMDatTh2DABKS1FcDHt7CIWorIRYjJISABCLUVhoWdfjMlNUo3wlkUi2b98OYP78+UrTuEyTSqU6cOCASCSaN28e6yxt47XXXtuwYQOAGTNm3Llzp+kDKisr16xZA+Ds2bPjxo0DUFhYeP78eQAQi1Fa2vgvJSVwc2uo0YcPXxzCnK7HZaaoRnls5MiRkyZNqqysNJErXCYlJWm12smTJ3t5ebHO0mZiY2MjIiLKysqSkpIAnDt3Tv969BtvvHHjxg0rK6vly5cDEIvF+mlULBaX6kvTze2J9tRvKSmBoyO0WlRVQSyGWm1B1+MyX1Sj/Pbll1+ayBUub926tW/fPhsbmwX8/WjUpxEIBHFxcSKRqGPHjgAePXpUU1MDYNSoUSKRSCQShYWFVVVVSSQSnU6Hx/q0Yep8fPx8vGHd3PDwoQVdj8uMsX5xlrTW8ePHBQKBSCTKzs5mGGPatGkAYmJiGGYwkCFDhgBYtmxZcx6sVqtLS0s5juPu3OFyczm
O406f5i5f5jiO27yZy8jgOI4bN467eZPT6ThnZ+7IEQ7g5PK675fL69aXuKctKNESk0midzGZg6lTp6ampoaEhKSnp9vY2Bg/QFFRkY+Pj0ajyc3N7dKli/EDGM6VK1d69erl4OBw48YNiURiqN08/3pc9fh7PS6zZuqfAkmaIyUl5fTp0xkZGWvXrl26dGlmZuajR4/s7OwA6E/etLe3t7OzEwgELi4uhgiwbt26qqqqCRMmmFmH4o/TYGfNmmXADgWe3oxNr77F0+txmTuaRs1E/RUu//a3v82fP79W/z6ZZ2jUqu3atROJREKh0NnZuelNBwcHW1vbRjetrKycnJz0N9VqdVRUlFqtvnr1ao8ePYzxbI3l2rVrQUFBtra2+fn5Hh4erOMQE0U1aj5mz569ZcuWhiUOIxKLxV5eXpmZmUber6FNnDjx22+/jY+P//zzz1lnIaaLatR8VFZWBgcH//rrr0ber5OTU3l5ubW19YULF4KDg428d8NRqVRdu3YVCoUqlapTp06s4xDTRSc8mY927drt3LlTKDTsv6mNjU337t3HjRu3bNmyI0eO5OXllZWVffzxxxqN5oMPPjCnS72vXr1aq9VOmTKFOpQ8H02j5mb06NGHDh1qwx/o4eHRq1evgICA7t27BwQEBAYGippcFKR+EE5ISPhMf7ETnrt165ZUKtVqtf/973/96BRO8lxUo+bmp59+GjhwYIu/3dXVtXv37vW9GRwc7ODg0JxvTE9PDw0N5TjuzJkzoaGhLQ5gIqZPn75t27bo6OhdJvlphMSkUI2am+zs7OYvlzs5OUmlUv2Y2b179969e7/Sik/UX7RoUXJysq+v79WrVx0dHVv8c5j7/ffffXx8qqurs7KyAgICWMchpo7OGzU3YrH4WXfZ2tp269YtICAgKCgoKCgoICDA29u7DXe9YsUKuVyenZ2dmJjI/M2prbFu3Tq1Wj1+/HjqUNIcNI2aG41G079//7S0NKFQ2Llz58DAwMDAwKCgoMDAQH9/f0O/x+nq1at9+/atra09efLkgAEDDLovAykpKfH29q6oqLh06VJISAjrOIQHqEbNkEajyczM7NatWzNf1mxbK1asWL58ube3d1ZWVvv27Y0foJUWL168Zs2akSNHtu1KHTFjVKOkjdVf6j0uLk7/iag88uDBA29v77KysrS0tH79+rGOQ/iBzhslbezxS73L5XLWcV5OSkpKWVlZeHg4dShpPppGiUGsW7du4cKFnp6e2dnZbm5urOM0S0VFhbe39717986cORMWFsY6DuENmkaJQcybNy8sLKywsHDOnDmsszTXF198ce/ePZlMRh1KXgpNo8RQ8vPzX3/99UePHn3//fdjx45lHecFqqqqfHx8ioqKjh07pr8gKCHNRNMoMRQfHx/95d6mT59+9+5d1nFeYMeOHUVFRSEhIYMHD2adhfAMTaPEgDiOi4iIOHbsmImfP6TRaKRSaUFBweHDh0eMGME6DuEZmkaJAQkEgtTUVBcXl8OHD+/du5d1nGfauXNnQUFBQEDAsGHDWGch/EM1SgzLy8tr/fr1AGbNmqW/NLGp0Wq1+oSffvqpoT9mkJgl+k9DDC42Nnb06NFlZWWxsbEm+CLS3r17lUqln5/fe++9xzoL4SWqUWIMX375pVgsPnHiRGpqKussT9DpdOvWrQOQmJhoZWXFOg7hJVpiIkayb9++SZMmOTg4ZGZm+vr6so5TZ//+/ePHj+/UqZNKpWJybWpiBmgaJUYyceLEcePGVVRUxMTE6HQ61nHqJCcnA1i8eDF1KGkxmkaJ8dy7dy8oKOj333/ftGnT7NmzWcfBv//97xEjRnh4eOTn59vZ2bGOQ/iKplFiPBKJRP+ZTwkJCca/gmlTa9euBbBw4ULqUNIaNI0SY4uOjt6zZ8+bb775yy+/MFzVOXHiRHh4uEQiuXnzJpMPZiVmg6ZRYmybN2/u1KlTenq6/mxNVlavXg1g3rx51KG
klWgaJQycPHkyPDzc1tb24sWLQUFBxg9w/vx5mUzm7OxcUFDg7Oxs/ADEnNA0ShgYNGhQXFxcdXV1VFRUTU2N8QOsWLECQHx8PHUoaT2aRgkbFRUVPXv2VKlUy5YtW758uSF2UVtbW1paWlJSUlpaWv+XkpKSX3/99cCBA46OjgUFBXz5SGliyqhGCTPnzp3785//LBAI0tLSevfu3fxvVKvV959UVFRUWFjYaGNxcbFWq33qT/Dy8hIKhXl5eXS6KGk9qlHC0ty5cz///PNu3bplZGTodLr6gbHeU8fJZ5VjI1ZWVm5ubmKxuNGfjo6OGzZsuH379qeffqo/uiekNahGCUtqtTokJOT69es2NjbNf5HU2dlZ/If6cmzUlRKJxMXF5Vk/ocWDMCFNUY0Sxi5dupSWljZnzhw7OzvXZ/P09PTw8HB1dZVIJLa2tq3f7yeffLJx48Zu3bpdvnzZ3t6+9T+QWCyqUcKeVqutqakxcpdVV1f36tUrNzd3/vz5+g95IqRl6IQnwpJWq92wYYOVlZXx50GRSLRnzx4bG5sNGzb8/PPPRt47MSdUo4Slmpqa4OBgVnsPCQlZsGCBTqeLiYl59OgRqxiE7+ignjCj0+mYX7RDo9H069fv8uXLs2bN2rJlC9swhKeoRgkz48aN8/X11X/MEkOZmZl9+vTRaDQKhWLIkCFswxA+oholzJSWll67di00NJR1ECQlJS1dutTLyysnJ+c5p0kR8lRUo4SBwsLC6urqzp07sw5Sp7a2NjQ09MKFC7GxsV9//TXrOIRnaImJMHDp0qU+ffosXLiQdZA61tbWu3btsre3T01NPXjwIOs4hGeoRgkDI0aMUKlUJnVB465duyYlJQFQbt2K0lLWcQif0EE9MaqHDx8mJCT89a9/9fPzY52lMZ1OlzV1as9duzBhAv75T9ZxCG/QNEqMSiAQuLu7y2Qy0zmirycUCnsuXQpHR+zbh++/Zx2H8AZNo8R4ampq9G+HLy8vV6lUDE+8f56tWzFjBiQSZGfjlVdYpyE8QDVKjEcmk/n7+yckJHTp0oV1lmfjOAwdCoUCI0bg8GHWaQgP0EE9MR6FQiGVSsPCwkzwiL6BQICvvoKrK44cwZ49rNMQHqBplBhFbi5EIvj5AaioqLh582ZAQADrTM+1axcmT4azM7Ky0KkT6zTEpNE0Sozi6lXIZPjgA+TmOjg4mHqHAoiJwZgxePAAsbGgUYM8F9UoMYr330deHoKCMGgQFi1inaZ5tm2DuztOnsSOHayjEJNGB/XEwPLysGoVEhKgX1ZSq3H7Nvz9WcdqnkOHMHo0HBxw9SpM70RXYiJoGiUG9qc/QSpFWBgmTEBWFuztedOhAEaNwvjxqKhATAyadx09YoGoRokhPXwIJycsWYL8fPTpg4gI3hzR19u2DV5eOHcOmzezjkJMFB3UE4PRatGtG3r3xuLF0K8pVVWhsBA+PqyTvaSjRzF8OEQiXL4M018cI0ZH0ygxGCsrZGSgTx8MGYLhw/F//wc7O/51KIBhwzB5Mnr0gI0N6yjEFNE0SgzjzBl06ICuXQFArcZXX2H9ekyaBNafdd9ClZWwtYW1NescxBTRNEoMQ6lE//4YPx6ZmbC3x+zZUKkwYwbrWC3Vrh11KHkWqlFiGFOnIi8PMhmGDsXgwUhPh60tL98OpFJBIGj4arS96SOJ5aEaJW3tp5/w/vvIyYGDA+bMgUqF0aMxYQKWLGGd7OUpFJBKIZeD48BxkMshEEClYh2LmBaqUdLWevdGnz54992GZaUZM6BUYuZM1sleXlIS5HJERNTdjIhASgqio5lmIiaHlpiIYVRXY/duJCWhUyesXImBA1kHenkqFaTSxm+o129UKgE0vvepjycWgKZRYhgiEeLioFIhOhoffojERNaBXp5SCZms8Ub9W0L1NQo88bKpVGrUeMRkUI2S1nn+CoytLT78ENev163Um98KjP41U/1XfbcSC0M1SlqhmSs
w1tZwd2eRr3WkUqSlPfMuQv5ANUpawbxXYPz8IJNBoXhi46ZNkMno057I46hGSUupVEhLa+hQvaFDkZZmPqcEJSYiMrKhSRUKxMdj926mmYjJoTdmkJZ6/gqM/rCX7y+GRkSA4554FrQQT5qgaZQYknmswDz+LOr5+TWu1KZbiGWgGiUtRSswhACgGiUtRyswhACgGiWtQiswhNASE2kVWoEhhN5TTwghrfT/P1hR+k05rGQAAAAASUVORK5CYII=\n" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ms_data = {\"HMDB0000122\": {\"mc\": [6, 12, 0, 6, 0, 0], \"exact_mass\": 180.063388116}}\n", + "\n", + "struct_generator = generate_structures(ms_data,\n", + " heavy_atoms=range(2, 13), max_valence=6,\n", + " max_atoms_available=2, max_n_substructures=3,\n", + " path_connectivity_db=os.path.join(notebook_data, \"connectivity.sqlite\"),\n", + " path_substructure_db=os.path.join(notebook_data, \"substructures.sqlite\"))\n", + "\n", + "# convert generated sets of smiles to rdkit.Chem.Mol objects\n", + "generated_mols = [Chem.MolFromSmiles(smiles) for i, smiles in enumerate(list(struct_generator)[0])]\n", + "generated_mols[0]" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## MS/MS guided structure generation\n", + "\n", + "MSn spectra provide a means of scoring candidates and limiting\n", + "the generation of unrealistic structures. 
A list of neutral\n", + "masses can be provided with the molecular composition and exact\n", + "mass of a metabolite of interest to inform the annotation process.\n", + "These masses may have been calculated from the fragment ions of\n", + "an MS2 spectrum; in this example, the masses of known substructures\n", + "have been chosen.\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total structures generated: 47\n", + "Number of structures generated by all peaks: 0\n", + "2\n" + ] + }, + { + "data": { + "text/plain": "", + "image/svg+xml": "\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nOH\nO\nOH\nOH\nOH\nOH\n\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAAYQ0lEQVR4nO3deVRTZ/oH8G8ASRBkSwYLVkUgbkArWpdmLFWrKNiqtDIu7QHcqFarVLGK4oJgFaa22GW0TvlZ2nFsrR2X9iTj1mpVpKODC+ApJYAgheoIgghhS+7vjyA7ypa8ucnzOTmc5L2B+80BnvPcvPfmFXAcB0IIIV1lxjoAIYTwG5VRQgjpFiqjhABKJQSCxluL8dbPJKQJKqPE5CkUkEohl4PjwHGQyyEQQKlkHYvwBpVRYvJiYyGXw9+//qG/PxISEBLCNBPhEwHN1BOTplRCKkWL/wLtYFYWgJZb23w+MW0WrAMQwlRWFmSyloMeHvWbpFIA9GYoeTw6qCfkSbTvmWpv2haVkCaojBLTJpUiObndTYR0AJVRYto8PCCTQaFoNrh7N2Sy+kN7Qp6EyigxeVFRCAhorKQKBcLDkZTENBPhE5piIibP3x8c12weiSbiSWfQCU+EPJKdjaFDIRSioAD29qzTEN6gg3pCHomLQ10d5s6lGko6hbpRQgAABQVwd4dajZs3MXgw6zSET6gbJQQAEB+PmhrMnUs1lHQWdaOEAHfuYNAgVFXhxg14ebFOQ3iGulFCgF27oFIhMJBqKOkC6kaJySspgasrystx+TKee451GsI/1I0Sk5eQgPJyBARQDSVdQ90oMW0PHsDVFffv4/x5jB/POg3hJepGiWn75BPcv49Jk6iGki6jbpSYsMpKDBqEu3dx5gwmTWKdhvAVdaPEhO3di7t3MXYs1VDSHdSNElNVXQ13d/z+O374AdOns05DeIy6UWKiUr75pkYohI8PAgJYZyH8Rt0oMUW1tbWDBw8uyM//5ejRka+8wjoO4TfqRokp+sc//nHr1i3pkCEj6HCedBuVUWJy1Gp1XFwcgI0bN5qZ0b8A6S76GyIm59ChQ5mZme7u7nPmzGGdhRgDKqPEtHAct2PHDgCRkZEWFrSIDukBNMVETMuRI0deffXV/v37K5VKS0tL1nGIMaBulJgWbSv67rvvUg0lPYW6UWJCFApFQEBA3759c3NzraysWMchRoK6UWJC3nvvPQARERFUQ0kPom6UmIqzZ89O
nDhRLBbn5ub26dOHdRxiPKgbJaZi+/btAMLDw6mGkp5F3SgxCb/88su4ceNsbW3z8vLsaRl60qOoGyXGj+O46OhoACtXrqQaSnocdaNEv5RKSKWND7v356dSqe63UlRUVFhY2HTkf//7n7W1dVVVVUFBgUQi6e5LIKQ5uoqD6JFCgYAAyOXw969/KBAgKwseHs2eVlWF4mKUlDR+vXcPJSVNR153dv72559ra2s7uGe1Wl1dXf3jjz/+5S9/6elXRUwddaNEj/78Z0RF1ddQrd27cegQfvgBs2c3Fs3Kyif+pJA///nLixdFIpFDW1xcXJydnRseOjk5JSYmLl26VCwWp6enP/XUUzp8jcT0UBkl+qI9nG/x96YdTE+Hl1fjoEgER0c4OkIsrv/acHs0WCmRWDg4dPxKJI7jpk+frlAoXnnllePHj/fcqyKEyijRG4UCsbG4eLHluEAAuRwiERwc6gultbUu9l9YWOjl5XX//v39+/eHhobqYhfENNFMPWtKJQSCxluL8dbPNFYTJ2LECPTvr6MaCsDFxSUhIQFAeHh4fn6+jvZCTBCVUaYUCkilkMvBceA4yOUQCKBUso6lG1IpkpPb3aQXwcHBr732WllZ2cKFC+k4jPQUKqNMxcY2TlsD8PdHQgJCQphm0hkPD8hkUCiaDe7eDZms5Uy9Lu3Zs6dv375nzpzZu3ev3nZKjBu9N8rOY6ZcsrIAtNza5vP5pfUJTwEBbZzwpGPHjh2bNWuWtbX11atXpfpqhIkRo26UnawsyGQtB7UFRVtGgWZvm/L9H768HP/+N3JyEBBQ/4oCAsBxeq6hAGbOnDlv3ryKiorQ0FC1Wq3nvRPjQ2XUsGnfM9XeGmorT+3Zg48+wqJFzV4UI3/729+efvrp5ORk7aQTId1BZZQdA5hy0Z+qKmgL1vr1rKMAgL29fWJiokAg2LhxY0ZGBus4hN+ojLJjGFMuevLZZygqwpgx8PNjHaWen5/fwoULq6urg4ODO35RKSGtURllKioKAQGNlVShQHg4kpKYZtKB2lp88AEAbNzIOkozH3zwwcCBA1NTU7XL1hPSNTRTbwCanlTf8OtomJevqcHt23B35/FM/b59ePNNPPssrl41tCsIfvzxx8mTJ5ubm6ekpIwaNYp1HMJL1I0agDanXDw8wHH4/XeMHYupU1FRUT/CO2o13n8fADZsMLQaCmDSpEnLly+vq6sLDg6uqqpiHYfwEpVRw9a3LywskJ2Nd99lHaWrDhxAVhaGDsXs2ayjtC0uLm7w4ME3b96MiYlhnYXwEh3UG7ybNzFqFKqrIZdj2jTWaTpJo8EzzyAjA0lJCA5mnaZdly5deuGFFwQCwYULF8aOHcs6DuEZ6kYN3vDh2LQJHIfFi3H/Pus0nXT4MDIy4OaG+fNZR3mc559/fs2aNXV1dSEhISqVinUcwjNURvlg/XqMH4/ff8fq1ayjdAbHQTsDvm4dLAx9nYVt27Z5e3tnZmZuNLDTCYjho4N6nsjMhI8PVCp89x1efZV1mo45fhwzZ+Lpp6FUQihknebJrl69OnbsWLVafebMmQkTJrCOQ3iDulGeGDIE770HAEuX4u5d1mk6RtuKRkTwooYC8PHx2bBhg0ajWbBgQXl5Oes4hDeoG+UPjQYvvYSzZxEYiH/9i3WaJzl5ElOnwskJubno3Zt1mo6qq6uTyWSXL19+88036ZP0SAdRN8ofZmbYvx99+uDIEXzzDes0T/DVwYNVHh5YvZpHNRSAhYVFUlKSSCTat2+fosV1uoS0g7pRvvnsMyxdCrEY6ekw1BUuz58/7+vrKxGLb+fkiGxtWcfptLi4uPXr17u4uKSnpzs4OLCOQwwddaN8ExYGf38UF2PJEtZR2rV9+3YAK95+m481FMDatWtfeOGFwsLC8PBwfeyP1uPiOSqjfCMQ4PPP4eiIH37AF1+wTtOG1NTUkydP2trarly5knWWLjIzM/viiy9sbGy+/PLL7777
rud3UFcHAAoFjh41rfW4jBSVUR5ycUFCgsbM7OOvv759+zbrNC1t27aN47jly5fz+nDYzc1N21MvW7bsbo+cGqFS4e9/B4DTpzF1KgAolTh92rTW4zJWHOGnLUuXApgyZYpGo2GdpVF6erqZmVnv3r3v3LnDOkt3aTQaPz8/ALNmzerK99+4wd29y3Ec99JLXH4+V1nJCYUcx3GpqdyIERzHcQcOcC+/zLX+H8zK4gAuK6v+TutNxMBQN8pXK2Ji+vbte+rUqT179rDO0igmJkaj0YSFhTk5ObHO0l0CgWDfvn22trZHjx49ePDg459cW1tbXFwM4MiRI1999RUA7NyJEycAoKQEd+/CygpmZqishFiM4mIAEItRWGha63EZKSqjfCWRSD777DMAa9euzTKMZZqUSuXhw4eFQmFERATrLD1j4MCBu3btAvDWW28VFBS0fkJlZeWOHTsAnD9/PigoCEBhYeGlS5cAQCxGSUnLO8XFcHRsLKMPHjw5hDGtx2WkqIzy2MyZM+fPn19ZWWkgK1zGxsaq1eoFCxb069ePdZYes2jRIn9//9LS0tjYWAAXL17Uvh/93HPP5ebmmpubb926FYBYLNZ2o2KxuERbNB0dm1VP7UhxMWxsoFajqgpiMVQqE1qPy3hRGeW3Tz/91EBWuMzPzz948GCvXr3e5e9Ho7ZFIBCEhYUJhcL+/fsDePjwYU1NDYBZs2YJhUKhUOjr61tVVSWRSDQaDZrU08aus2n72bTCOjriwQMTWo/LiLF+c5Z018mTJwUCgVAoTEtLYxhj6dKlAEJDQxlm0JGpU6cC2LJlS0eerFKpSkpKOI7jCgq4jAyO47izZ7n//pfjOO6jj7jUVI7juKAg7tYtTqPh7Oy448c5gJPL679fLq+fX+LamlCiKSaDRFcxGYPFixcnJiaOHDkyJSWlV69e+g9QVFTk5uZWW1ubkZExZMgQ/QfQnatXr44aNcra2jo3N1cikehqN49fj6sBf9fjMmqG/imQpCMSEhLOnj2bmpq6c+fOTZs2Xb9+/eHDhyKRCID25E0rKyuRSCQQCOzt7XURID4+vqqqau7cuUZWQ/HoNNgVK1bosIYCbVfG1qtv8XQ9LmNH3aiRaFjh8q9//evatWvrtNfJtKNFVe3du7dQKDQzM7Ozs2v90Nra2tLSssVDc3NzW1tb7UOVShUcHKxSqa5du/bMM8/o49Xqy82bN729vS0tLXNycpydnVnHIQaKyqjxWLly5ccff9w4xaFHYrG4X79+169f1/N+dW3evHlff/11eHj4hx9+yDoLMVxURo1HZWWlj4/Pb7/9puf92tralpeXW1hY/PLLLz4+Pnreu+4olcqhQ4eamZkplcoBAwawjkMMF53wZDx69+69f/9+MzPd/k579eo1fPjwoKCgLVu2HD9+PDs7u7S09O23366trX3jjTeMaan37du3q9XqhQsXUg0lj0fdqLEJDAw8evRoD/5AZ2fnUaNGeXp6Dh8+3NPT08vLS9hqUZCGRjgyMvI97WInPJefny+VStVq9a+//upBp3CSx6Iyamx++umnSZMmdfnbHRwchg8f3lA3fXx8rK2tO/KNKSkp48eP5zju3Llz48eP73IAA7Fs2bK9e/eGhIR8YZCfRkgMCpVRY5OWltbx6XJbW1upVKptM4cPHz569OinuvGJ+uvXr4+Li3N3d7927ZqNjU2Xfw5zf/zxh5ubW3V19Y0bNzw9PVnHIYaOzhs1NmKxuL1NlpaWw4YN8/T09Pb29vb29vT0dHV17cFdR0dHy+XytLS0qKgo5hendkd8fLxKpZozZw7VUNIR1I0am9ra2gkTJiQnJ5uZmQ0aNMjLy8vLy8vb29vLy2vw4MG6vsbp2rVrY8eOraurO3369MSJE3W6Lx0pLi52dXWtqKi4cuXKyJEjWcchPEBl1AjV1tZev3592LBhHXxbs2dFR0dv3brV1dX1xo0bffr00X+AbtqwYcOOHTtm
zpzZszN1xIhRGSU9rGGp97CwMO0novJIWVmZq6traWlpcnLy888/zzoO4Qc6b5T0sKZLvcvlctZxOichIaG0tNTPz49qKOk46kaJTsTHx69bt87FxSUtLc3R0ZF1nA6pqKhwdXW9d+/euXPnfH19WcchvEHdKNGJiIgIX1/fwsLCVatWsc7SUZ988sm9e/dkMhnVUNIp1I0SXcnJyXn22WcfPnz47bffzp49m3WcJ6iqqnJzcysqKjpx4oR2QVBCOoi6UaIrbm5u2uXeli1bdufOHdZxnmDfvn1FRUUjR46cMmUK6yyEZ6gbJTrEcZy/v/+JEycM/Pyh2tpaqVSal5d37NixGTNmsI5DeIa6UaJDAoEgMTHR3t7+2LFjBw4cYB2nXfv378/Ly/P09Hz55ZdZZyH8Q2WU6Fa/fv3ef/99ACtWrNAuTWxo1Gq1NuHmzZt1/TGDxCjRHw3RuUWLFgUGBpaWli5atMgA30Q6cOBAVlaWh4fHa6+9xjoL4SUqo0QfPv30U7FYfOrUqcTERNZZmtFoNPHx8QCioqLMzc1ZxyG8RFNMRE8OHjw4f/58a2vr69evu7u7s45T79ChQ3PmzBkwYIBSqWSyNjUxAtSNEj2ZN29eUFBQRUVFaGioRqNhHadeXFwcgA0bNlANJV1G3SjRn3v37nl7e//xxx+7d+9euXIl6zj4/vvvZ8yY4ezsnJOTIxKJWMchfEXdKNEfiUSi/cynyMhI/a9g2trOnTsBrFu3jmoo6Q7qRom+hYSEfPnll+PGjbtw4QLDWZ1Tp075+flJJJJbt24x+WBWYjSoGyX69tFHHw0YMCAlJUV7tiYr27dvBxAREUE1lHQTdaOEgdOnT/v5+VlaWl6+fNnb21v/AS5duiSTyezs7PLy8uzs7PQfgBgT6kYJA5MnTw4LC6uurg4ODq6pqdF/gOjoaADh4eFUQ0n3UTdK2KioqBgxYoRSqdyyZcvWrVt1sYu6urqSkpLi4uKSkpKGO8XFxb/99tvhw4dtbGzy8vL48pHSxJBRGSXMXLx48cUXXxQIBMnJyaNHj+74N6pUqvvNFRUVFRYWthi8e/euWq1u8yfY2dmZm5unpaW5uLj00KshpovKKGFp9erVH3744bBhw1JTUzUaTUPD2KDNdrK94tiCubm5o6OjWCxu8VUkEm3ZsqWiosLb2/v8+fN0XE+6icooYUmlUo0cOTIzM7NXr14df5PUzs5O/EhDcWxRKyUSib29fXs/YdOmTbGxsQCmTZv2/fffW1hY9MzrISaJyihh7MqVK8nJyatWrRKJRA7tc3FxcXZ2dnBwkEgklpaW3dxpWVnZoEGD7t+/D2DRokWff/55T7wUYqKojBL21Gp1TU2NlZWVPncaExOzefPmhvtRUVH63DsxJnTCE2FJrVbv2rXL3NxczzUUwDvvvOPk5KS9v3nz5qSkJD0HIEaDyihhqaamxsfHh8mubWxs1qxZo73PcdySJUtOnTrFJAnhOzqoJ8xoNBq2i3ZUVFS4u7s3rFpqZ2d34cIFLy8vhpEIH1E3SpiZM2fO+vXrGQawtrZuGqCsrGzatGmGuWAUMWTUjRJmSkpKbt68OX78eIYZqqqqpFJpQUFBw4iXl9eFCxfoZFLScdSNEgYKCwtzc3MdHR3Z1lAAIpEoMjKy6Uh6enpgYCCTK/0JT1EZJQxcuXJlzJgx69atYx0EABYvXjxw4EDtfQtgA/DJTz9l0yqhpMPooJ6wUVZWlpmZOWbMGNZBACAxMXHx4sVDgSSgMVB0NB6dWErIY1AZJXr14MGDyMjId955x8PDg3WWRnV1dc8MHXo1O1vYdFQgwP79CAlhlYrwBR3UE70SCAROTk4ymcxAjui1LCws/m/NGmGLUY7DkiU4fZpJJMIj1I0S/ampqdFeDl9eXq5UKlmdeN82tRpubsjPbznepw/On8ezz7LIRPiBulGiPxMmTAgNDc3MzOzTp49h1VAA5uaIj29jvLwcAQGg
k0lJ+6iMEv1RKBRSqdTX19egjugbBQWhzYWhCgvx8st48EDvgQg/0EE90YuMDAiF8PAAUFFRcevWLU9PT9aZ2nLkCF59te1NEybgxAl0+zP6iPGhbpToxbVrkMnwxhvIyLC2tjbQGgpg1iy0t5zJ2bNYsADUdpBWqIwSvXj9dWRnw9sbkyeD6XX0TyAQYMuWdrf+85+IjtZjGsIPdFBPdCw7GzExiIzEkCEAoFLh9m0MHsw61mPJZLh0qd2tu3Zh9Wo9piGGjrpRomN/+hOkUvj6Yu5c3LgBKytDr6EAtm173NaICNCiI6QJKqNElx48gK0tNm5ETg7GjIG/v0Ef0TeYPBkvvtjuVo7Do48oJQR0UE90SK3GsGEYPRobNkA7p1RVhcJCuLmxTtYBP//cbiV1coJcjlGj9BuIGC7qRonOmJsjNRVjxmDqVLzyCv7zH4hE/KihAHx9MXlyG+NBQcjIoBpKmqIySnTj3Dn8+itsbLBqFbKyMGUKZs/mxxF9g5iYZg8lEhw6hEOHIJEwCkQMFJVRohtZWZgwAXPm4Pp1WFlh5UoolXjrLdaxOmPcOEyfXn9/xgykpyMoiGkgYqCojBLdWLwY2dmQyTB9OqZMQUoKLC0xYADrWJ20dy+0160eP46nnoJA0LhJqWz2sM0RYhqojJKe9tNPeP11pKfD2hqrVkGpRGAg5s7Fxo2sk3VeWhri4iCXg+PAcZDLIRBAqWQdixgWKqOkp40ejTFjMG1a47TSW28hKwvLl7NO1nmxsZDL4e9f/9DfHwkJ9EHOpAU64YnoRnU1kpIQG4sBA7BtGyZNYh2o85RKSKUtL6LXDmZlAWi5tc3nExNA3SjRDaEQYWFQKhESgiVLEBXFOlDnZWVBJms5qF37RFtGAQgEjTepVK/xiMGgMkq6Rzuv0nBrMW5piSVLkJlZP1NvfDMw2vdMtbeG2kpMDJVR0g0KBaTSJ8/AWFjAyYlFvu6RSpGc3O4mQh6hMkq6wbhnYDw8IJNBoWg2uHs3ZDIY0rKmhDkqo6SrlEokJzfWUK3p05GcbDynBEVFISCgsZIqFAgPR1IS00zE4FiwDkB46/EzMNrDXr6/GervD45r9ipoIp60Qt0o0SXjmIFp+ioaeHi0LKmtR4hpoDJKuopmYAgBQGWUdB3NwBACgMoo6RaagSGEpphIt9AMDCF0TT0hhHTT/wMYh1aX6Oj0WQAAAABJRU5ErkJggg==\n" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ms_data = {\"HMDB0000122\": {\"mc\": [6, 12, 0, 6, 0, 0], \"exact_mass\": 180.063388116,\n", + " \"prescribed_masses\": [46.0055, 60.0211, 73.029]}} #\n", + "\n", + "annotation_generator = annotate_msn(ms_data, heavy_atoms=range(2, 13), max_valence=6,\n", + " max_atoms_available=2, max_n_substructures=3, ppm=5,\n", + " path_connectivity_db=os.path.join(notebook_data, \"connectivity.sqlite\"),\n", + " path_substructure_db=os.path.join(notebook_data, \"substructures.sqlite\"))\n", + "\n", + "annotated_structures = list(annotation_generator)[0]\n", + "print(\"Total structures generated: \" + str(len(annotated_structures)))\n", + "print(\"Number of 
structures generated by all peaks: \" + str(list(annotated_structures.values()).count(3)))\n", + "\n", + "print(annotated_structures[\"OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O\"])\n", + "Chem.MolFromSmiles(\"OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "name": "pycharm-6393377e", + "language": "python", + "display_name": "PyCharm (metaboverse)" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 272fb79..0000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -numpy -scipy -pandas -networkx -# rdkit -biopython -matplotlib -# nauty diff --git a/setup.py b/setup.py index 2b2a207..89fb891 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2017-2019 Ralf Weber. +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. 
# @@ -21,7 +21,6 @@ import setuptools -import sys import metaboblend @@ -30,10 +29,10 @@ def main(): setuptools.setup( name="metaboblend", version=metaboblend.__version__, - description="", + description="Python package for de novo structural elucidation of small molecules in mass spectrometry-based Metabolomics", long_description=open('README.rst').read(), - author="Ralf Weber", - author_email="r.j.weber@bham.ac.uk", + author="Ralf Weber, Jack Gisby", + author_email="r.j.weber@bham.ac.uk, jackgisby@gmail.com", url="https://github.com/computational-metabolomics/metaboblend", license="GPLv3", platforms=['Windows, UNIX'], @@ -41,7 +40,6 @@ def main(): packages=setuptools.find_packages(), test_suite='tests.suite', python_requires='>=3.7', - install_requires=open('requirements.txt').read().splitlines(), include_package_data=True, classifiers=[ "Programming Language :: Python :: 3", diff --git a/tests/__init__.py b/tests/__init__.py index 1d66a83..3aeb4d8 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2019-2020 Ralf Weber +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. # diff --git a/tests/build_structures/__init__.py b/tests/build_structures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/build_structures/test_annotate.py b/tests/build_structures/test_annotate.py new file mode 100644 index 0000000..961c76c --- /dev/null +++ b/tests/build_structures/test_annotate.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# + +import shutil +import pickle +import tempfile +import unittest +from rdkit import Chem + +from metaboblend.build_structures.annotate import * + + +class AnnotateTestCase(unittest.TestCase): + temp_results_dir = None + + @classmethod + def to_test_results(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) + + @classmethod + def to_test_data(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) + + @classmethod + def setUpClass(cls): + cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) + + shutil.copytree(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "test_data"), + cls.to_test_results("test_data")) + + def test_generate_structures(self): # tests vs build + + db = SubstructureDb(self.to_test_data("substructures.sqlite"), self.to_test_data("connectivity.sqlite")) + + fragments = [56.05, 60.0211, 68.0262, 56.0262] + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dicts = pickle.load(test_hmdbs) + for i, record_dict in enumerate(record_dicts.values()): + ms_data = {record_dict["HMDB_ID"]: {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + "exact_mass": record_dict["exact_mass"]}} + + # test standard building + returned_smis = list( + generate_structures(ms_data, path_substructure_db=self.to_test_data("substructures.sqlite"), + write_csv_output=True, path_out=self.to_test_results(), + 
max_degree=6, max_atoms_available=2, max_n_substructures=3, + path_connectivity_db=self.to_test_data("connectivity.sqlite"), + minimum_frequency=None, yield_smis=True, isomeric_smiles=True, + retain_substructures=True)) + + returned_smis = returned_smis[0][record_dict["HMDB_ID"]] + + build_smis = build( + mf=[record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + exact_mass=record_dict["exact_mass"], + max_n_substructures=3, ppm=None, ncpus=None, table_name=None, isomeric_smiles=True, + db=db, tolerance=0.0001, prescribed_substructures=None, max_bde=None + ) + + self.assertEqual(set(build_smis.keys()), set(returned_smis)) + + ms_data = {record_dict["HMDB_ID"]: {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + "exact_mass": record_dict["exact_mass"], + "prescribed_mass": fragments[i]}} + + # test prescribed building + returned_smis = list( + generate_structures(ms_data, path_substructure_db=self.to_test_data("substructures.sqlite"), + write_csv_output=True, path_out=self.to_test_results(), + max_degree=6, max_atoms_available=2, max_n_substructures=3, + path_connectivity_db=self.to_test_data("connectivity.sqlite"), + minimum_frequency=None, yield_smis=True, isomeric_smiles=True, + retain_substructures=False)) + + returned_smis = returned_smis[0][record_dict["HMDB_ID"]] + + prescribed_substructures = get_possible_fragment_ions(fragments[i], db) + + build_smis = build( + mf=[record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + exact_mass=record_dict["exact_mass"], max_n_substructures=3, ppm=0, + ncpus=None, table_name=None, isomeric_smiles=True, db=db, tolerance=0.0001, + prescribed_substructures=prescribed_substructures, max_bde=None + ) + + self.assertEqual(set(build_smis.keys()), set(returned_smis)) + + ms_data = {} + for i, record_dict in enumerate(record_dicts.values()): + 
record_dict["mol"] = Chem.MolFromSmiles(record_dict["smiles"]) + ms_data[record_dict["HMDB_ID"]] = {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + "exact_mass": record_dict["exact_mass"], + "prescribed_masses": None} + + # test building with multiple inputs + returned_smi_list = list( + generate_structures(ms_data, path_substructure_db=self.to_test_data("substructures.sqlite"), + write_csv_output=True, path_out=self.to_test_results(), + max_degree=6, max_atoms_available=2, max_n_substructures=3, + path_connectivity_db=self.to_test_data("connectivity.sqlite"), + minimum_frequency=None, yield_smis=True, isomeric_smiles=True, + retain_substructures=False)) + + for i, record_dict in enumerate(record_dicts.values()): + build_smis = build( + mf=[record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + exact_mass=record_dict["exact_mass"], + max_n_substructures=3, ppm=None, ncpus=None, table_name=None, isomeric_smiles=True, + prescribed_substructures=None, db=db, tolerance=0.0001, max_bde=None + ) + + self.assertEqual(set(build_smis.keys()), set(returned_smi_list[i][record_dict["HMDB_ID"]])) + + db.close() + + def test_annotate_msn(self): # tests vs build + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + + overall_lens = [3, 41, 2, 0] + smis = [{'NCCc1ccc(O)c(O)c1', 'NCCc1cc(O)ccc1O', 'NCCc1cc(O)cc(O)c1'}, + None, + {'N[C@@H](Cc1cccc(O)c1)C(=O)O', 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'}, + set()] + freqs = [1, 0, 0, 0] + + fragments = [56.05, 60.0211, 68.0262, 56.0262] + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dicts = pickle.load(test_hmdbs) + for i, record_dict in enumerate(record_dicts.values()): + + if not os.path.exists(self.to_test_results("annotate")): + os.mkdir(self.to_test_results("annotate")) + + ms_data = {record_dict["HMDB_ID"]: {"mf": [record_dict["C"], record_dict["H"], 
record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + "exact_mass": record_dict["exact_mass"], + "neutral_fragment_masses": fragments}} + + # test standard building + returned_smis = list(annotate_msn( + ms_data, max_degree=6, max_atoms_available=2, max_n_substructures=3, + write_csv_output=True, retain_substructures=False, path_out=self.to_test_results(), + path_connectivity_db=self.to_test_data("connectivity.sqlite"), + path_substructure_db=self.to_test_data("substructures.sqlite"), + minimum_frequency=None, yield_smis=True, isomeric_smiles=True + )) + + returned_smis = returned_smis[0][record_dict["HMDB_ID"]] + + self.assertEqual(len([t[1] for t in returned_smis if t[1] > 1]), freqs[i]) + + if smis[i] is not None: + self.assertEqual(set(t[0] for t in returned_smis), smis[i]) + + if i == 0: + self.assertEqual(returned_smis[0][1], 3) + + ms_data = {} + for i, record_dict in enumerate(record_dicts.values()): + record_dict["mol"] = Chem.MolFromSmiles(record_dict["smiles"]) + ms_data[record_dict["HMDB_ID"]] = {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + "exact_mass": record_dict["exact_mass"], + "neutral_fragment_masses": fragments} + + os.mkdir(self.to_test_results("annotate_multi")) + + # test building with multiple inputs + returned_smi_list = list(annotate_msn( + ms_data, max_degree=6, max_atoms_available=2, max_n_substructures=3, + path_out=self.to_test_results("annotate_multi"), write_csv_output=True, + path_connectivity_db=self.to_test_data("connectivity.sqlite"), + path_substructure_db=self.to_test_data("substructures.sqlite"), + minimum_frequency=None, yield_smis=True, + isomeric_smiles=True, retain_substructures=False + )) + + for i, record_dict in enumerate(record_dicts.values()): + self.assertEqual(len(returned_smi_list[i][record_dict["HMDB_ID"]]), overall_lens[i]) + + db.close() + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/build_structures/test_build.py b/tests/build_structures/test_build.py new file mode 100644 index 0000000..722c808 --- /dev/null +++ b/tests/build_structures/test_build.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# + +import shutil +import pickle +import tempfile +import unittest + +from metaboblend.databases.substructures import SubstructureDb +from metaboblend.build_structures.build import * + + +class BuildTestCase(unittest.TestCase): + temp_results_dir = None + + @classmethod + def to_test_results(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) + + @classmethod + def to_test_data(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) + + @classmethod + def setUpClass(cls): + cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) + + shutil.copytree(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "test_data"), + cls.to_test_results("test_data")) + + def test_build(self): # core - all other build functions rely on + + db = SubstructureDb(self.to_test_data("substructures.sqlite"), self.to_test_data("connectivity.sqlite")) + + # ref 
data + smis = [{'NCCc1cc(O)ccc1O', 'NCCc1cccc(O)c1O', 'NCCc1cc(O)cc(O)c1', 'NCCc1ccc(O)c(O)c1'}, + None, + {'N[C@@H](Cc1ccc(O)cc1)C(=O)O', 'N[C@@H](Cc1cccc(O)c1)C(=O)O', 'N[C@H](Cc1ccc(O)cc1)C(=O)O'}, + None] + std_lens = [4, 47, 3, 1892] + fragments = [56.05, 60.0211, 68.0262, 56.0262] + exp_lens = [1, 41, 2, 0] + + # hmdb records to build from + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dicts = pickle.load(test_hmdbs) + + for i, record_dict in enumerate(record_dicts.values()): + + # test standard building + built_smis = build( + mf=[record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + exact_mass=record_dict["exact_mass"], max_n_substructures=3, db=db, ppm=None, ncpus=None, + table_name=None, isomeric_smiles=True, prescribed_substructures=None, + tolerance=None, max_bde=None + ) + + self.assertEqual(len(built_smis), std_lens[i]) + + if smis[i] is not None: + self.assertEqual(set(built_smis.keys()), smis[i]) + + # test prescribed substructure building + built_smis = build( + mf=[record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + exact_mass=record_dict["exact_mass"], max_n_substructures=3, tolerance=0.0001, + prescribed_substructures=get_possible_fragment_ions(fragments[i], db, 2, 5, 0.0001), ppm=15, + ncpus=None, isomeric_smiles=True, db=db, table_name=None, max_bde=None + ) + + if i == 2: + self.assertEqual(set(built_smis.keys()), + {'N[C@@H](Cc1ccc(O)cc1)C(=O)O', 'N[C@@H](Cc1cccc(O)c1)C(=O)O'}) + + self.assertEqual(len(built_smis.keys()), exp_lens[i]) + + db.close() + + def test_substructure_combination_build(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite"), + self.to_test_data("connectivity.sqlite")) + + ec_products = [[(3, 3, 0, 1, 0, 0), (5, 8, 1, 1, 0, 0)], + [(3, 5, 0, 1, 0, 0), (2, 4, 1, 1, 0, 0), (4, 2, 0, 1, 0, 0)], + [(3, 5, 1, 0, 0, 0), (2, 3, 0, 2, 0, 0)], + [(3, 6, 
0, 4, 0, 0), (5, 9, 0, 3, 0, 0), (4, 7, 0, 4, 0, 0)]] + configs_iso = db.k_configs() + lens = [3, 0, 0, 44] + + for i, ec_product in enumerate(ec_products): + substructure_subset = db.select_substructures(ec_product, None) + + smis = substructure_combination_build(substructure_subset, configs_iso, prescribed_method=False, + isomeric_smiles=True, bond_enthalpies=get_bond_enthalpies(), + max_bde=None) + + self.assertEqual(len(smis.keys()), lens[i]) + + if i == 0: + self.assertEqual(list(smis.keys()), ['NCCc1ccc(O)c(O)c1', 'NCCc1cc(O)ccc1O', 'NCCc1cc(O)cc(O)c1']) + + db.close() + + def test_build_from_subsets(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + + mcs = [[12, 22, 0, 11, 0, 0], [10, 0, 0, 0, 0, 0], + [9, 11, 1, 3, 0, 0], [9, 11, 1, 3, 0, 0]] + exact_subsets = [(103.0395, 119.0344, 120.0423), + (84.0449, 97.029), (50.0156, 57.0215, 74.0368), (50.0156, 57.034, 74.0242)] + + lens = [13, 0, 1, 1] + + for i, mc, exact_subset in zip(range(len(mcs)), mcs, exact_subsets): + substructure_subsets = build_from_subsets(exact_subset, mc, None, db) + + if i == 1: + self.assertEqual(len(substructure_subsets), 0) + else: + self.assertEqual(len(substructure_subsets[0][0]), lens[i]) + + if i == 2: + del substructure_subsets[0][0][0]["mol"] + self.assertEqual(substructure_subsets[0][0], + [{'atoms_available': 2, + 'bond_types': {1: [1.0, 1.5], 4: [1.5, 1.0]}, + 'degree_atoms': {1: 2, 4: 2}, + 'dummies': [0, 2, 3, 5], + 'smiles': '*c1:*:*:c(*)cc1', + 'valence': 4}]) + + db.close() + + def test_gen_subs_table(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite"), "") + table_name = gen_subs_table(db, 5, 6, 4, 2, 500) + + i = 0 + db.cursor.execute("SELECT heavy_atoms, valence, atoms_available FROM %s" % table_name + "_substructures") + for row in db.cursor.fetchall(): + i += 1 + + self.assertTrue(row[0] in range(5, 7)) + self.assertTrue(row[1] <= 4) + self.assertTrue(row[2] <= 2) + + self.assertEqual(i, 57) + + db.close() + + def 
test_subset_sum(self): # also tests find_path + + self.assertEqual([s_sum for s_sum in subset_sum([1, 2, 3, 4], 5)], [[2, 3], [1, 4]]) + + self.assertEqual(len(list(subset_sum(list(range(60)), 70, 3))), 378) + self.assertEqual(len(list(subset_sum(list(range(60)), 70, 1000))), 29884) + + def test_combine_ecs(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite"), "") + self.assertEqual(combine_mfs([54.0106, 69.0578], db, None, "0_0001"), + [[(3, 2, 0, 1, 0, 0)], [(4, 7, 1, 0, 0, 0)]]) + self.assertEqual(combine_mfs([54, 69], db, None, "1"), + [[(3, 2, 0, 1, 0, 0)], [(4, 7, 1, 0, 0, 0)]]) + self.assertEqual(combine_mfs([54.0101, 69.0580], db, None, "0_0001"), []) + + db.close() + + def test_reindex_atoms(self): + + substructure_combinations = [ + [{'smiles': '*C(*)C(=O)O', 'mol': None, 'bond_types': {1: [1.0, 1.0]}, 'degree_atoms': {1: 2}, + 'valence': 2, 'atoms_available': 1, 'dummies': [0, 2]}, + {'smiles': 'NCCc1c:*:*:cc1', 'mol': None, 'bond_types': {4: [1.5], 6: [1.5], 7: [1.5]}, + 'degree_atoms': {4: 1, 7: 1}, 'valence': 2, 'atoms_available': 2, 'dummies': [5, 6]}], + [{'smiles': '*[C@@H](O)[C@@H](*)O', 'mol': None, 'bond_types': {1: [1.0], 3: [1.0]}, + 'degree_atoms': {1: 1, 3: 1}, 'valence': 2, 'atoms_available': 2, 'dummies': [0, 5]}, + {'smiles': 'OC1**[C@@H](O)[C@H](O)[C@H]1O', 'mol': None, 'bond_types': {0: [1.0], 3: [1.0], 4: [1.0]}, + 'degree_atoms': {0: 1, 4: 1}, 'valence': 2, 'atoms_available': 2, 'dummies': [2, 3]}], + [{'smiles': '*C[C@H](N)C(=O)O', 'mol': None, 'bond_types': {2: [1.0]}, 'degree_atoms': {2: 1}, + 'valence': 1, 'atoms_available': 1, 'dummies': [3]}, + {'smiles': '*c1ccc(O)cc1', 'mol': None, 'bond_types': {1: [1.0]}, 'degree_atoms': {1: 1}, + 'valence': 1, 'atoms_available': 1, 'dummies': [0]}] + ] + + reindexed = [ + [None, [1, 10, 13], [0, 2, 11, 12], + {1: [1.0, 1.0], 10: [1.5], 12: [1.5], 13: [1.5]}], + ["*[C@@H](O)[C@@H](*)O.OC1**[C@@H](O)[C@H](O)[C@H]1O", [1, 3, 6, 10], [0, 5, 8, 9], + {1: [1.0], 3: [1.0], 6: 
[1.0], 9: [1.0], 10: [1.0]}], + ["*C[C@H](N)C(=O)O.*c1ccc(O)cc1", [2, 8], [3, 7], {2: [1.0], 8: [1.0]}] + ] + + for substructure_combination, reindex in zip(substructure_combinations, reindexed): + substructure_combination[0]["mol"] = Chem.MolFromSmiles(substructure_combination[0]["smiles"], False) + substructure_combination[1]["mol"] = Chem.MolFromSmiles(substructure_combination[1]["smiles"], False) + + mol_comb, atoms_available, atoms_to_remove, bond_types, bond_mismatch = reindex_atoms( + substructure_combination) + + if mol_comb is None: + mol_comb_smiles = None + else: + mol_comb_smiles = Chem.MolToSmiles(mol_comb) + + self.assertEqual([mol_comb_smiles, atoms_available, atoms_to_remove, bond_types], reindex) + + def test_add_bonds(self): + + mol_comb = [Chem.MolFromSmiles("*C(*)C(=O)O.NCCc1c:*:*:cc1", False), + Chem.MolFromSmiles("*[C@@H](O)[C@@H](*)O.OC1**[C@@H](O)[C@H](O)[C@H]1O", False), + Chem.MolFromSmiles("*C[C@H](N)C(=O)O.*c1ccc(O)cc1", False)] + + atoms_available = [[1, 10, 13], [1, 3, 6, 10], [2, 8]] + + bond_types = [{1: [1.0, 1.0], 10: [1.5], 12: [1.5], 13: [1.5]}, + {1: [1.0], 3: [1.0], 6: [1.0], 9: [1.0], 10: [1.0]}, + {2: [1.0], 8: [1.0]}] + + edges = [((0, 1), (0, 2)), ((0, 2), (1, 3)), ((0, 1),)] + + mol_out = [None, "*[CH]1(O)OC2**[CH](O)(C(O)C2O)[CH]1(*)O", "*C[CH](N)(C(=O)O)c1(*)ccc(O)cc1"] + + for i in range(len(atoms_available)): + mol_e, total_bde = add_bonds( + mol_comb[i], + edges[i], + atoms_available[i], + bond_types[i], + get_bond_enthalpies() + ) + + if i == 0: + self.assertTrue(mol_e is None) + else: + self.assertEqual(Chem.MolToSmiles(mol_e.GetMol(), False), mol_out[i]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/databases/__init__.py b/tests/databases/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_auxiliary.py b/tests/databases/test_connectivity.py similarity index 75% rename from tests/test_auxiliary.py rename to tests/databases/test_connectivity.py index 15475b2..242d984 
100644 --- a/tests/test_auxiliary.py +++ b/tests/databases/test_connectivity.py @@ -1,120 +1,151 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright © 2019-2020 Ralf Weber -# -# This file is part of MetaboBlend. -# -# MetaboBlend is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# MetaboBlend is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with MetaboBlend. If not, see . -# - - -import os -import unittest -from io import BytesIO -import shutil -import tempfile -import pickle -from metaboblend.auxiliary import * - - -class AuxiliaryTestCase(unittest.TestCase): - temp_results_dir = None - - @classmethod - def to_test_results(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) - - @classmethod - def to_test_data(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) - - @classmethod - def setUpClass(cls): - cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) - - shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"), - cls.to_test_results("test_data")) - - cls.lines_geng = [b'E?oo', b'ECO_', b'ECQ_', b'ECZ?', b'ECX_', b'ECYO', b'EEh_', b'EQhO'] - - with open(cls.to_test_data("mappings.pkl"), "rb") as mappings_pkl: - cls.mappings = pickle.load(mappings_pkl) - - with open(cls.to_test_data("gi_out.pkl"), "rb") as gi_out_pkl: - cls.gi_out = pickle.load(gi_out_pkl) - - cls.p_list = [] - for G, p in 
calculate_complete_multipartite_graphs([1, 2], 3): - cls.p_list.append(p) - cls.final_graph = G - - def test_calculate_complete_multipartite_graphs(self): - self.assertEqual(self.p_list, [(1, 1), (1, 2), (2, 2), (1, 1, 1), (1, 1, 2), (1, 2, 2), (2, 2, 2)]) - self.assertEqual(nx.number_of_edges(self.final_graph), 12) - self.assertEqual(nx.number_of_nodes(self.final_graph), 6) - self.assertEqual(list(self.final_graph.edges()), [(0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), - (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]) - self.assertEqual(list(self.final_graph.nodes()), [0, 1, 2, 3, 4, 5]) - - def test_draw_subgraph(self): - # INSERT: 1 CU 1 2 (2, 2) (3, 3) ((1, 2), (2, 1)) 4 3 - small_plt, small_sG = draw_subgraph([(0, 2), (1, 2), (1, 3)], ((1, 2), (2, 1))) - # INSERT: 1 DQo 2 3 (1, 2, 2) (2, 4, 2) ((2,), (2, 2), (1, 1)) 5 4 - large_plt, large_sG = draw_subgraph([(0, 1), (0, 2), (1, 3), (2, 4)], ((2,), (2, 2), (1, 1))) - - self.assertEqual(list(small_sG.nodes()), [0, 2, 1, 3]) - self.assertEqual(small_sG.number_of_nodes(), 4) - self.assertEqual(list(small_sG.edges()), [(0, 2), (2, 1), (1, 3)]) - self.assertEqual(small_sG.number_of_edges(), 3) - self.assertEqual(list(large_sG.nodes()), [0, 1, 2, 3, 4]) - self.assertEqual(large_sG.number_of_nodes(), 5) - self.assertEqual(list(large_sG.edges()), [(0, 1), (0, 2), (1, 3), (2, 4)]) - self.assertEqual(large_sG.number_of_edges(), 4) - - def test_graph_to_ri(self): - k_graph = graph_to_ri(self.final_graph, "k_graph") - self.assertEqual(nx.number_of_nodes(self.final_graph) + nx.number_of_edges(self.final_graph) + 3, - k_graph.count("\n")) - - for i, line_geng in enumerate(self.lines_geng): - sG = nx.read_graph6(BytesIO(line_geng)) - subgraph = graph_to_ri(sG, "subgraph") - - self.assertEqual(nx.number_of_nodes(sG) + nx.number_of_edges(sG) + 3, subgraph.count("\n")) - - def test_iso_aux(self): # tests graph_info, valences, sort_subgraphs - for line_geng, mappings, gi_val in zip(self.lines_geng, self.mappings, 
self.gi_out): - sG = nx.read_graph6(BytesIO(line_geng)) - - if len(mappings) > 0: - gi = graph_info(self.p_list[-1], sG, mappings, ) - self.assertEqual(gi, gi_val) # test gi vs reference values - - for m in mappings: - ug = nx.relabel_nodes(sG, m, copy=True) - vn = get_degrees(self.p_list[-1], ug) - self.assertEqual(len(vn), len(self.p_list[-1])) - [self.assertEqual(len(t), 2) for t in vn] - - for vn in gi: - sorted_subgraphs = sort_subgraphs(gi[vn]) - self.assertLessEqual(len(sorted_subgraphs), len(gi[vn])) - - for subgraph in gi[vn]: - self.assertTrue(sorted([tuple(sorted(e)) for e in subgraph]) in sorted_subgraphs) - - -if __name__ == '__main__': - unittest.main() +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . 
+# + +import sys +import shutil +import unittest +from io import BytesIO + +from metaboblend.databases.connectivity import * + + +class ConnectivityTestCase(unittest.TestCase): + temp_results_dir = None + + @classmethod + def to_test_results(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) + + @classmethod + def to_test_data(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) + + @classmethod + def setUpClass(cls): + cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) + + shutil.copytree(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "test_data"), + cls.to_test_results("test_data")) + + cls.lines_geng = [b'E?oo', b'ECO_', b'ECQ_', b'ECZ?', b'ECX_', b'ECYO', b'EEh_', b'EQhO'] + + with open(cls.to_test_data("mappings.pkl"), "rb") as mappings_pkl: + cls.mappings = pickle.load(mappings_pkl) + + with open(cls.to_test_data("gi_out.pkl"), "rb") as gi_out_pkl: + cls.gi_out = pickle.load(gi_out_pkl) + + cls.p_list = [] + for G, p in calculate_complete_multipartite_graphs([1, 2], 3): + cls.p_list.append(p) + cls.final_graph = G + + def test_calculate_complete_multipartite_graphs(self): + + self.assertEqual(self.p_list, [(1, 1), (1, 2), (2, 2), (1, 1, 1), (1, 1, 2), (1, 2, 2), (2, 2, 2)]) + self.assertEqual(nx.number_of_edges(self.final_graph), 12) + self.assertEqual(nx.number_of_nodes(self.final_graph), 6) + self.assertEqual(list(self.final_graph.edges()), [(0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), + (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]) + self.assertEqual(list(self.final_graph.nodes()), [0, 1, 2, 3, 4, 5]) + + def test_draw_subgraph(self): + + # INSERT: 1 CU 1 2 (2, 2) (3, 3) ((1, 2), (2, 1)) 4 3 + small_plt, small_sG = draw_subgraph([(0, 2), (1, 2), (1, 3)], ((1, 2), (2, 1))) + # INSERT: 1 DQo 2 3 (1, 2, 2) (2, 4, 2) ((2,), (2, 2), (1, 1)) 5 4 
+ large_plt, large_sG = draw_subgraph([(0, 1), (0, 2), (1, 3), (2, 4)], ((2,), (2, 2), (1, 1))) + + self.assertEqual(list(small_sG.nodes()), [0, 2, 1, 3]) + self.assertEqual(small_sG.number_of_nodes(), 4) + self.assertEqual(list(small_sG.edges()), [(0, 2), (2, 1), (1, 3)]) + self.assertEqual(small_sG.number_of_edges(), 3) + self.assertEqual(list(large_sG.nodes()), [0, 1, 2, 3, 4]) + self.assertEqual(large_sG.number_of_nodes(), 5) + self.assertEqual(list(large_sG.edges()), [(0, 1), (0, 2), (1, 3), (2, 4)]) + self.assertEqual(large_sG.number_of_edges(), 4) + + def test_graph_to_ri(self): + + k_graph = graph_to_ri(self.final_graph, "k_graph") + self.assertEqual(nx.number_of_nodes(self.final_graph) + nx.number_of_edges(self.final_graph) + 3, + k_graph.count("\n")) + + for i, line_geng in enumerate(self.lines_geng): + sG = nx.read_graph6(BytesIO(line_geng)) + subgraph = graph_to_ri(sG, "subgraph") + + self.assertEqual(nx.number_of_nodes(sG) + nx.number_of_edges(sG) + 3, subgraph.count("\n")) + + def test_iso_aux(self): # tests graph_info, valences, sort_subgraphs + + for line_geng, mappings, gi_val in zip(self.lines_geng, self.mappings, self.gi_out): + sG = nx.read_graph6(BytesIO(line_geng)) + + if len(mappings) > 0: + gi = graph_info(self.p_list[-1], sG, mappings, ) + self.assertEqual(gi, gi_val) # test gi vs reference values + + for m in mappings: + ug = nx.relabel_nodes(sG, m, copy=True) + vn = get_degrees(self.p_list[-1], ug) + self.assertEqual(len(vn), len(self.p_list[-1])) + [self.assertEqual(len(t), 2) for t in vn] + + for vn in gi: + sorted_subgraphs = sort_subgraphs(gi[vn]) + self.assertLessEqual(len(sorted_subgraphs), len(gi[vn])) + + for subgraph in gi[vn]: + self.assertTrue(sorted([tuple(sorted(e)) for e in subgraph]) in sorted_subgraphs) + + def test_create_connectivity_database(self): + + pkg_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + + if sys.platform == "linux" or sys.platform == "linux2": + + self.path_ri = 
os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36") + + create_connectivity_database(self.to_test_results("connectivity.sqlite"), + 3, # sizes + [1, 2], # boxes + self.path_ri + ) + + ref_db = sqlite3.connect(self.to_test_data("connectivity.sqlite")) + ref_db_cursor = ref_db.cursor() + ref_db_cursor.execute("SELECT * FROM subgraphs") + + test_db = sqlite3.connect(self.to_test_results("connectivity.sqlite")) + test_db_cursor = test_db.cursor() + test_db_cursor.execute("SELECT * FROM subgraphs") + + test_rows = {} + for row in test_db_cursor.fetchall(): + test_rows[row[0]] = row + + ref_db.close() + test_db.close() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/databases/test_results.py b/tests/databases/test_results.py new file mode 100644 index 0000000..e77b769 --- /dev/null +++ b/tests/databases/test_results.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . 
+# + +import shutil +import pickle +import unittest +import tempfile +from rdkit import Chem + +from metaboblend.build_structures.annotate import annotate_msn +from metaboblend.databases.results import * + + +class ResultsDbTestCase(unittest.TestCase): + temp_results_dir = None + + @classmethod + def to_test_results(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) + + @classmethod + def to_test_data(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) + + @classmethod + def setUpClass(cls): + cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) + + shutil.copytree(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "test_data"), + cls.to_test_results("test_data")) + + def test_results_db(self): # TODO: directly test each unit of ResultsDb + + fragments = [56.05, 60.0211, 68.0262, 56.0262] + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dicts = pickle.load(test_hmdbs) + + ms_data = {} + for i, record_dict in enumerate(record_dicts.values()): + record_dict["mol"] = Chem.MolFromSmiles(record_dict["smiles"]) + ms_data[record_dict["HMDB_ID"]] = {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], + record_dict["O"], record_dict["P"], record_dict["S"]], + "exact_mass": record_dict["exact_mass"], + "neutral_fragment_masses": fragments} + + os.mkdir(self.to_test_results("test_results_db")) + list(annotate_msn( + ms_data, max_degree=6, max_atoms_available=2, max_n_substructures=3, + path_out=self.to_test_results("test_results_db"), write_csv_output=True, + path_connectivity_db=self.to_test_data("connectivity.sqlite"), + path_substructure_db=self.to_test_data("substructures.sqlite"), + minimum_frequency=None, yield_smis=True, + isomeric_smiles=True, retain_substructures=True + )) + + # is the sqlite database the size we 
expect? + self.assertEqual(os.path.getsize(self.to_test_results("test_results_db", "metaboblend_results.sqlite")), 86016) + + # are the csv files the same as the reference? + with open(self.to_test_results("test_results_db", "metaboblend_queries.csv"), "r") as results_file, \ + open(self.to_test_data("metaboblend_queries.csv"), "r") as test_file: + + for results_line, test_line in zip(results_file, test_file): + self.assertEqual(results_line, test_line) + + with open(self.to_test_results("test_results_db", "metaboblend_structures.csv"), "r") as results_file, \ + open(self.to_test_data("metaboblend_structures.csv"), "r") as test_file: + + for results_line, test_line in zip(results_file, test_file): + self.assertEqual(results_line, test_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_databases.py b/tests/databases/test_substructures.py similarity index 51% rename from tests/test_databases.py rename to tests/databases/test_substructures.py index 20c7ff7..033f196 100644 --- a/tests/test_databases.py +++ b/tests/databases/test_substructures.py @@ -1,371 +1,687 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright © 2019-2020 Ralf Weber -# -# This file is part of MetaboBlend. -# -# MetaboBlend is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# MetaboBlend is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with MetaboBlend. If not, see . 
-# - - -import os -import unittest -import shutil -import pickle -from metaboblend.databases import * -from metaboblend.parse import reformat_xml - - -class DatabasesTestCase(unittest.TestCase): - temp_results_dir = None - - @classmethod - def to_test_results(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) - - @classmethod - def to_test_data(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) - - @classmethod - def setUpClass(cls): - cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) - - shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"), - cls.to_test_results("test_data")) - - def test_reformat_xml(self): - reformat_xml(self.to_test_data("HMDB0000073_raw.xml")) - - with open(self.to_test_data("HMDB0000073_raw.xml"), "r", encoding="utf-8") as fn_hmdb: - xml_contents = fn_hmdb.readlines() - - self.assertTrue("hmdb" in xml_contents[1]) - self.assertTrue(len(xml_contents), 3283) - - def test_parse_xml(self): - hmdbs = ["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"] - lengths = [111, 109, 112, 108] - elements = ["C8H11NO2", "C6H12O6", "C9H11NO3", "C12H22O11"] - smis = ["NCCC1=CC(O)=C(O)C=C1", - "[H]C1(O)O[C@]([H])(CO)[C@@]([H])(O)[C@]([H])(O)[C@@]1([H])O", - "N[C@@H](CC1=CC=C(O)C=C1)C(O)=O", - "OC[C@H]1O[C@@H](O[C@H]2[C@H](O)[C@@H](O)[C@@H](O)O[C@@H]2CO)[C@H](O)[C@@H](O)[C@H]1O"] - - with open(self.to_test_data("parsed_records.dictionary"), "rb") as parsed: - parsed_records = pickle.load(parsed) - - for i, hmdb in enumerate(["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"]): - - for record_out in parse_xml(self.to_test_data(hmdb + ".xml")): - - self.assertEqual(len(record_out), lengths[i]) - self.assertEqual(record_out["accession"], hmdbs[i]) - self.assertEqual(record_out["smiles"], smis[i]) - 
self.assertEqual(record_out["chemical_formula"], elements[i]) - self.assertEqual(record_out, parsed_records[hmdb + ".xml"]) - - def test_filter_records(self): - with open(self.to_test_data("parsed_records.dictionary"), "rb") as p: - parsed_records = pickle.load(p) - - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - filtered_records = pickle.load(test_hmdbs) - - record_gen = filter_records(parsed_records.values(), isomeric_smiles=True) - test_filtered_records = {} - for record in record_gen: - del record["mol"] - test_filtered_records[record["HMDB_ID"]] = record - - self.assertEqual(test_filtered_records, filtered_records) - - def test_get_substructure_bond_idx(self): - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - record_dict = pickle.load(test_hmdbs)["HMDB0000186"] - mol = Chem.MolFromSmiles(record_dict["smiles"]) - - subs_mol = Chem.MolFromSmiles("OC[C@H]1OC[C@H](O)[C@@H](O)[C@H]1O") - self.assertEqual(get_substructure_bond_idx(subs_mol, mol), (0, 1, 2, 22, 3, 16, 17, 18, 19, 20, 21)) - - subs_mol = Chem.MolFromSmiles("OC[C@@H]1C[C@H](O)[C@@H](O)[C@@H](O)O1") - self.assertEqual(get_substructure_bond_idx(subs_mol, mol), (0, 1, 2, 22, 3, 4, 16, 17, 18, 19, 20)) - - def test_subset_sgs_sizes(self): - sgs = [[(0, 1, 2, 22, 3, 16, 17, 18, 19, 20, 21), (0, 1, 2, 22, 3, 4, 16, 17, 18, 19, 20)]] - - self.assertEqual(len(subset_sgs_sizes(sgs, -10, 100)[0]), 2) - self.assertEqual(len(subset_sgs_sizes(sgs, 100, 100)), 0) - self.assertEqual(len(subset_sgs_sizes(sgs, 0, 0)), 0) - self.assertEqual(len(subset_sgs_sizes(sgs, 11, 11)[0]), 2) - self.assertEqual(len(subset_sgs_sizes(sgs, 11, 12)[0]), 2) - self.assertEqual(len(subset_sgs_sizes(sgs, 10, 11)[0]), 2) - self.assertEqual(len(subset_sgs_sizes(sgs, 12, 100)), 0) - self.assertEqual(len(subset_sgs_sizes(sgs, 0, 10)), 0) - - def test_get_sgs(self): - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - record_dict = 
pickle.load(test_hmdbs)["HMDB0000186"] - record_dict["mol"] = Chem.MolFromSmiles(record_dict["smiles"]) - mol_ids = [bond.GetIdx() for bond in record_dict["mol"].GetBonds()] - - sgs = get_sgs(record_dict, 2, 9, method="exhaustive") - for edges in sgs: - for edge_set in edges: - self.assertTrue(2 <= len(edge_set) <= 9) - [self.assertTrue(bond in mol_ids) for bond in edge_set] - - sgs = get_sgs(record_dict, 0, 20, method="RECAP") - for edges in sgs: - for edge_set in edges: - self.assertTrue(0 <= len(edge_set) <= 20) - [self.assertTrue(bond in mol_ids) for bond in edge_set] - - sgs = get_sgs(record_dict, 0, 20, method="BRICS") - for edges in sgs: - for edge_set in edges: - self.assertTrue(0 <= len(edge_set) <= 20) - [self.assertTrue(bond in mol_ids) for bond in edge_set] - - def test_get_substructure(self): - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - record_dict = pickle.load(test_hmdbs)["HMDB0000186"] - mol = Chem.MolFromSmiles(record_dict["smiles"]) - - libs = [{'smiles': '*[C@@H]1O[C@H](CO)[C@H](O)[C@H](O)[C@H]1O', 'bond_types': {4: [1.0]}, - 'degree_atoms': {4: 1}, 'valence': 1, 'atoms_available': 1, 'dummies': [5]}, - {'smiles': '*O[C@@H]1O[C@H](CO)[C@H](*)[C@H](O)[C@H]1O', 'bond_types': {5: [1.0], 11: [1.0]}, - 'degree_atoms': {5: 1, 11: 1}, 'valence': 2, 'atoms_available': 2, 'dummies': [6, 12]}] - - for edges in [[(0, 1, 2, 22, 3, 16, 17, 18, 19, 20, 21), (0, 1, 2, 22, 3, 4, 16, 17, 18, 19, 20)]]: - for i, edge_idx in enumerate(edges): - lib = get_substructure(mol, edge_idx, isomeric_smiles=True) - del lib["mol"] - self.assertEqual(lib, libs[i]) - - def test_get_elements(self): - compositions = [{'C': 8, 'H': 11, 'N': 1, 'O': 2, 'P': 0, 'S': 0, '*': 0}, - {'C': 6, 'H': 12, 'N': 0, 'O': 6, 'P': 0, 'S': 0, '*': 0}, - {'C': 9, 'H': 11, 'N': 1, 'O': 3, 'P': 0, 'S': 0, '*': 0}, - {'C': 12, 'H': 22, 'N': 0, 'O': 11, 'P': 0, 'S': 0, '*': 0}] - - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - 
record_dicts = pickle.load(test_hmdbs) - for i, record_dict in enumerate(record_dicts.values()): - mol = Chem.MolFromSmiles(record_dict["smiles"]) - - self.assertEqual(get_elements(mol), compositions[i]) - - def test_calculate_exact_mass(self): - masses = [153.07897899999998, 180.06338999999997, 181.07389399999997, 342.1162150000005] - - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - record_dicts = pickle.load(test_hmdbs) - for i, record_dict in enumerate(record_dicts.values()): - mol = Chem.MolFromSmiles(record_dict["smiles"]) - - self.assertEqual(calculate_exact_mass(mol), masses[i]) - - ref_db = sqlite3.connect(self.to_test_data("substructures.sqlite")) - ref_db_cursor = ref_db.cursor() - ref_db_cursor.execute("SELECT exact_mass__0_0001, mol FROM substructures") - for row in ref_db_cursor.fetchall(): - self.assertEqual(round(calculate_exact_mass(Chem.Mol(row[1])), 4), row[0]) - - ref_db.close() - - def test_create_substructure_database(self): - records = [self.to_test_data(r + ".xml") for r in ["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"]] - - create_substructure_database(records, self.to_test_results("test_db.sqlite"), 4, 8, method="exhaustive", - isomeric_smiles=True) - - test_db = sqlite3.connect(self.to_test_results("test_db.sqlite")) - test_db_cursor = test_db.cursor() - - test_db_cursor.execute("""SELECT smiles, - heavy_atoms, - length, - exact_mass__1, - exact_mass__0_0001, - exact_mass, - C, - H, - N, - O, - P, - S, - valence, - valence_atoms, - atoms_available, - bond_types, - dummies - FROM substructures WHERE valence <= 4""") - - for i, row in enumerate(test_db_cursor.fetchall()): - if i == 0: - self.assertEqual(row, ('*:c(:*)CCN', 4, 10, 56, 56.05, 56.05002399999998, 3, 6, 1, 0, 0, 0, 2, '{3: 2}', - 1, '{3: [1.5, 1.5]}', '[4, 5]') - ) - - total_rows = i - - self.assertEqual(total_rows, 585) - - test_db_cursor.execute("SELECT * FROM hmdbid_substructures") - for i, row in 
enumerate(test_db_cursor.fetchall()): - if i == 0: - self.assertEqual(row, ('HMDB0000073', 1)) - total_rows = i - - self.assertEqual(total_rows, 1292) - - test_db_cursor.execute("SELECT * FROM compounds") - for i, row in enumerate(test_db_cursor.fetchall()): - if i == 0: - self.assertEqual(row, - ('HMDB0000073', 153.078979, 'C8H11NO2', 8, 11, 1, 2, 0, 0, 'NCCC1=CC(O)=C(O)C=C1')) - total_rows = i - - self.assertEqual(total_rows, 3) - - test_db_cursor.execute("SELECT heavy_atoms FROM substructures") - unique_ha = set() - for ha in test_db_cursor.fetchall(): - self.assertTrue(4 <= ha[0] <= 8) - unique_ha.add(ha[0]) - - [self.assertTrue(ha in unique_ha) for ha in [4, 5, 6, 7, 8]] - - test_db.close() - - def test_update_substructure_database(self): # requires create_compound_database from SubstructureDb - db = SubstructureDb(self.to_test_results("test_db.sqlite"), "") - db.create_compound_database() - db.close() - - for record in ["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"]: - record = self.to_test_data(record + ".xml") - - update_substructure_database(self.to_test_data(record), - self.to_test_results("test_db.sqlite"), 4, 8, - method="exhaustive", isomeric_smiles=True) - - shutil.copyfile(self.to_test_data("substructures.sqlite"), self.to_test_results("substructures_copy.sqlite")) - - test_db = sqlite3.connect(self.to_test_results("test_db.sqlite")) - test_db_cursor = test_db.cursor() - - test_db_cursor.execute("""SELECT smiles, - heavy_atoms, - length, - exact_mass__1, - exact_mass__0_0001, - exact_mass, - C, - H, - N, - O, - P, - S, - valence, - valence_atoms, - atoms_available, - bond_types, - dummies - FROM substructures WHERE valence <= 4""") - - for i, row in enumerate(test_db_cursor.fetchall()): - if i == 0: - self.assertEqual(row, ('*:c(:*)CCN', 4, 10, 56, 56.05, 56.05002399999998, 3, 6, 1, 0, 0, 0, 2, '{3: 2}', - 1, '{3: [1.5, 1.5]}', '[4, 5]')) - - total_rows = i - - self.assertEqual(total_rows, 585) - - test_db_cursor.execute("SELECT * FROM 
hmdbid_substructures") - for i, row in enumerate(test_db_cursor.fetchall()): - if i == 0: - self.assertEqual(row, ('HMDB0000073', 1)) - total_rows = i - - self.assertEqual(total_rows, 1292) - - test_db_cursor.execute("SELECT * FROM compounds") - for i, row in enumerate(test_db_cursor.fetchall()): - if i == 0: - self.assertEqual(row, - ('HMDB0000073', 153.078979, 'C8H11NO2', 8, 11, 1, 2, 0, 0, 'NCCC1=CC(O)=C(O)C=C1')) - total_rows = i - - self.assertEqual(total_rows, 3) - - test_db_cursor.execute("SELECT heavy_atoms FROM substructures") - unique_ha = set() - for ha in test_db_cursor.fetchall(): - self.assertTrue(4 <= ha[0] <= 8) - unique_ha.add(ha[0]) - - [self.assertTrue(ha in unique_ha) for ha in [4, 5, 6, 7, 8]] - - test_db.close() - - # small substructures - db = SubstructureDb(self.to_test_results("test_db.sqlite"), "") - db.create_compound_database() - db.close() - - for record in ["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"]: - record = self.to_test_data(record + ".xml") - - update_substructure_database(self.to_test_data(record), - self.to_test_results("test_db.sqlite"), 1, 1, - method="exhaustive", isomeric_smiles=True) - - test_db = sqlite3.connect(self.to_test_results("test_db.sqlite")) - test_db_cursor = test_db.cursor() - - test_db_cursor.execute("""SELECT smiles, - heavy_atoms, - length, - exact_mass__1, - exact_mass__0_0001, - exact_mass, - C, - H, - N, - O, - P, - S, - valence, - valence_atoms, - atoms_available, - bond_types, - dummies - FROM substructures WHERE valence <= 4""") - - for i, row in enumerate(test_db_cursor.fetchall()): - if i == 0: - self.assertEqual(row, ('*N', 1, 3, 16, 16.0187, 16.018724, 0, 2, 1, 0, 0, 0, 1, - '{0: 1}', 1, '{0: [1.0]}', '[1]')) - - self.assertEqual(i, 8) - - test_db.close() - - -if __name__ == '__main__': - unittest.main() +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. 
+# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# + +import shutil +import tempfile +import unittest + +from metaboblend.parse import reformat_xml +from metaboblend.databases.substructures import * + + +class DatabasesTestCase(unittest.TestCase): + temp_results_dir = None + + @classmethod + def to_test_results(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) + + @classmethod + def to_test_data(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) + + @classmethod + def setUpClass(cls): + cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) + + shutil.copytree(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "test_data"), + cls.to_test_results("test_data")) + + def test_reformat_xml(self): + + reformat_xml(self.to_test_data("HMDB0000073_raw.xml")) + + with open(self.to_test_data("HMDB0000073_raw.xml"), "r", encoding="utf-8") as fn_hmdb: + xml_contents = fn_hmdb.readlines() + + self.assertTrue("hmdb" in xml_contents[1]) + self.assertTrue(len(xml_contents), 3283) + + def test_parse_xml(self): + + hmdbs = ["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"] + lengths = [111, 109, 112, 108] + elements = ["C8H11NO2", "C6H12O6", "C9H11NO3", "C12H22O11"] + smis = ["NCCC1=CC(O)=C(O)C=C1", + 
"[H]C1(O)O[C@]([H])(CO)[C@@]([H])(O)[C@]([H])(O)[C@@]1([H])O", + "N[C@@H](CC1=CC=C(O)C=C1)C(O)=O", + "OC[C@H]1O[C@@H](O[C@H]2[C@H](O)[C@@H](O)[C@@H](O)O[C@@H]2CO)[C@H](O)[C@@H](O)[C@H]1O"] + + with open(self.to_test_data("parsed_records.dictionary"), "rb") as parsed: + parsed_records = pickle.load(parsed) + + for i, hmdb in enumerate(["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"]): + + for record_out in parse_xml(self.to_test_data(hmdb + ".xml")): + + self.assertEqual(len(record_out), lengths[i]) + self.assertEqual(record_out["accession"], hmdbs[i]) + self.assertEqual(record_out["smiles"], smis[i]) + self.assertEqual(record_out["chemical_formula"], elements[i]) + self.assertEqual(record_out, parsed_records[hmdb + ".xml"]) + + def test_filter_records(self): + + with open(self.to_test_data("parsed_records.dictionary"), "rb") as p: + parsed_records = pickle.load(p) + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + filtered_records = pickle.load(test_hmdbs) + + record_gen = filter_records(parsed_records.values(), isomeric_smiles=True) + test_filtered_records = {} + for record in record_gen: + del record["mol"] + test_filtered_records[record["HMDB_ID"]] = record + + self.assertEqual(test_filtered_records, filtered_records) + + def test_get_substructure_bond_idx(self): + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dict = pickle.load(test_hmdbs)["HMDB0000186"] + mol = Chem.MolFromSmiles(record_dict["smiles"]) + + subs_mol = Chem.MolFromSmiles("OC[C@H]1OC[C@H](O)[C@@H](O)[C@H]1O") + self.assertEqual(get_substructure_bond_idx(subs_mol, mol), (0, 1, 2, 22, 3, 16, 17, 18, 19, 20, 21)) + + subs_mol = Chem.MolFromSmiles("OC[C@@H]1C[C@H](O)[C@@H](O)[C@@H](O)O1") + self.assertEqual(get_substructure_bond_idx(subs_mol, mol), (0, 1, 2, 22, 3, 4, 16, 17, 18, 19, 20)) + + def test_subset_sgs_sizes(self): + + sgs = [[(0, 1, 2, 22, 3, 16, 17, 18, 19, 20, 21), (0, 1, 2, 22, 3, 4, 16, 17, 18, 19, 20)]] 
+ + self.assertEqual(len(subset_sgs_sizes(sgs, -10, 100)[0]), 2) + self.assertEqual(len(subset_sgs_sizes(sgs, 100, 100)), 0) + self.assertEqual(len(subset_sgs_sizes(sgs, 0, 0)), 0) + self.assertEqual(len(subset_sgs_sizes(sgs, 11, 11)[0]), 2) + self.assertEqual(len(subset_sgs_sizes(sgs, 11, 12)[0]), 2) + self.assertEqual(len(subset_sgs_sizes(sgs, 10, 11)[0]), 2) + self.assertEqual(len(subset_sgs_sizes(sgs, 12, 100)), 0) + self.assertEqual(len(subset_sgs_sizes(sgs, 0, 10)), 0) + + def test_get_sgs(self): + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dict = pickle.load(test_hmdbs)["HMDB0000186"] + record_dict["mol"] = Chem.MolFromSmiles(record_dict["smiles"]) + mol_ids = [bond.GetIdx() for bond in record_dict["mol"].GetBonds()] + + sgs = get_sgs(record_dict, 2, 9, method="exhaustive") + for edges in sgs: + for edge_set in edges: + self.assertTrue(2 <= len(edge_set) <= 9) + [self.assertTrue(bond in mol_ids) for bond in edge_set] + + sgs = get_sgs(record_dict, 0, 20, method="RECAP") + for edges in sgs: + for edge_set in edges: + self.assertTrue(0 <= len(edge_set) <= 20) + [self.assertTrue(bond in mol_ids) for bond in edge_set] + + sgs = get_sgs(record_dict, 0, 20, method="BRICS") + for edges in sgs: + for edge_set in edges: + self.assertTrue(0 <= len(edge_set) <= 20) + [self.assertTrue(bond in mol_ids) for bond in edge_set] + + def test_get_substructure(self): + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dict = pickle.load(test_hmdbs)["HMDB0000186"] + mol = Chem.MolFromSmiles(record_dict["smiles"]) + + libs = [{'smiles': '*[C@@H]1O[C@H](CO)[C@H](O)[C@H](O)[C@H]1O', 'bond_types': {4: [1.0]}, + 'degree_atoms': {4: 1}, 'valence': 1, 'atoms_available': 1, 'dummies': [5]}, + {'smiles': '*O[C@@H]1O[C@H](CO)[C@H](*)[C@H](O)[C@H]1O', 'bond_types': {5: [1.0], 11: [1.0]}, + 'degree_atoms': {5: 1, 11: 1}, 'valence': 2, 'atoms_available': 2, 'dummies': [6, 12]}] + + for edges in [[(0, 1, 2, 22, 3, 
16, 17, 18, 19, 20, 21), (0, 1, 2, 22, 3, 4, 16, 17, 18, 19, 20)]]: + for i, edge_idx in enumerate(edges): + lib = get_substructure(mol, edge_idx, isomeric_smiles=True) + del lib["mol"] + self.assertEqual(lib, libs[i]) + + def test_get_elements(self): + + compositions = [{'C': 8, 'H': 11, 'N': 1, 'O': 2, 'P': 0, 'S': 0, '*': 0}, + {'C': 6, 'H': 12, 'N': 0, 'O': 6, 'P': 0, 'S': 0, '*': 0}, + {'C': 9, 'H': 11, 'N': 1, 'O': 3, 'P': 0, 'S': 0, '*': 0}, + {'C': 12, 'H': 22, 'N': 0, 'O': 11, 'P': 0, 'S': 0, '*': 0}] + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dicts = pickle.load(test_hmdbs) + for i, record_dict in enumerate(record_dicts.values()): + mol = Chem.MolFromSmiles(record_dict["smiles"]) + + self.assertEqual(get_elements(mol), compositions[i]) + + def test_calculate_exact_mass(self): + + masses = [153.07897899999998, 180.06338999999997, 181.07389399999997, 342.1162150000005] + + with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: + record_dicts = pickle.load(test_hmdbs) + for i, record_dict in enumerate(record_dicts.values()): + mol = Chem.MolFromSmiles(record_dict["smiles"]) + + self.assertEqual(calculate_exact_mass(mol), masses[i]) + + ref_db = sqlite3.connect(self.to_test_data("substructures.sqlite")) + ref_db_cursor = ref_db.cursor() + ref_db_cursor.execute("SELECT exact_mass__0_0001, mol FROM substructures") + for row in ref_db_cursor.fetchall(): + self.assertEqual(round(calculate_exact_mass(Chem.Mol(row[1])), 4), row[0]) + + ref_db.close() + + def test_create_substructure_database(self): + + records = [self.to_test_data(r + ".xml") for r in ["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"]] + + create_substructure_database(records, self.to_test_results("test_db.sqlite"), 4, 8, method="exhaustive", isomeric_smiles=True) + test_db = sqlite3.connect(self.to_test_results("test_db.sqlite")) + + test_db_cursor = test_db.cursor() + + test_db_cursor.execute("""SELECT smiles, + 
heavy_atoms, + length, + exact_mass__1, + exact_mass__0_0001, + exact_mass, + C, + H, + N, + O, + P, + S, + valence, + valence_atoms, + atoms_available, + bond_types, + dummies + FROM substructures WHERE valence <= 4""") + + for i, row in enumerate(test_db_cursor.fetchall()): + if i == 0: + self.assertEqual(row, ('*:c(:*)CCN', 4, 10, 56, 56.05, 56.05002399999998, 3, 6, 1, 0, 0, 0, 2, '{3: 2}', + 1, '{3: [1.5, 1.5]}', '[4, 5]') + ) + + total_rows = i + + self.assertEqual(total_rows, 139) + + test_db_cursor.execute("SELECT * FROM hmdbid_substructures") + for i, row in enumerate(test_db_cursor.fetchall()): + if i == 0: + self.assertEqual(row, ('HMDB0000073', 1)) + total_rows = i + + self.assertEqual(total_rows, 150) + + test_db_cursor.execute("SELECT * FROM compounds") + for i, row in enumerate(test_db_cursor.fetchall()): + if i == 0: + self.assertEqual(row, + ('HMDB0000073', 153.078979, 'C8H11NO2', 8, 11, 1, 2, 0, 0, 'NCCC1=CC(O)=C(O)C=C1')) + total_rows = i + + self.assertEqual(total_rows, 3) + + test_db_cursor.execute("SELECT heavy_atoms FROM substructures") + unique_ha = set() + for ha in test_db_cursor.fetchall(): + self.assertTrue(4 <= ha[0] <= 8) + unique_ha.add(ha[0]) + + [self.assertTrue(ha in unique_ha) for ha in [4, 5, 6, 7, 8]] + + test_db.close() + + def test_update_substructure_database(self): # requires create_compound_database from SubstructureDb + + db = SubstructureDb(self.to_test_results("test_db.sqlite"), "") + db.create_compound_database() + db.close() + + for record in ["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"]: + record = self.to_test_data(record + ".xml") + + update_substructure_database(self.to_test_data(record), + self.to_test_results("test_db.sqlite"), 4, 8, + method="exhaustive", isomeric_smiles=True) + + shutil.copyfile(self.to_test_data("substructures.sqlite"), self.to_test_results("substructures_copy.sqlite")) + + test_db = sqlite3.connect(self.to_test_results("test_db.sqlite")) + test_db_cursor = test_db.cursor() + + 
test_db_cursor.execute("""SELECT smiles, + heavy_atoms, + length, + exact_mass__1, + exact_mass__0_0001, + exact_mass, + C, + H, + N, + O, + P, + S, + valence, + valence_atoms, + atoms_available, + bond_types, + dummies + FROM substructures WHERE valence <= 4""") + + for i, row in enumerate(test_db_cursor.fetchall()): + if i == 0: + self.assertEqual(row, ('*:c(:*)CCN', 4, 10, 56, 56.05, 56.05002399999998, 3, 6, 1, 0, 0, 0, 2, '{3: 2}', + 1, '{3: [1.5, 1.5]}', '[4, 5]')) + + total_rows = i + + self.assertEqual(total_rows, 575) + + test_db_cursor.execute("SELECT * FROM hmdbid_substructures") + for i, row in enumerate(test_db_cursor.fetchall()): + if i == 0: + self.assertEqual(row, ('HMDB0000073', 1)) + total_rows = i + + self.assertEqual(total_rows, 1292) + + test_db_cursor.execute("SELECT * FROM compounds") + for i, row in enumerate(test_db_cursor.fetchall()): + if i == 0: + self.assertEqual(row, + ('HMDB0000073', 153.078979, 'C8H11NO2', 8, 11, 1, 2, 0, 0, 'NCCC1=CC(O)=C(O)C=C1')) + total_rows = i + + self.assertEqual(total_rows, 3) + + test_db_cursor.execute("SELECT heavy_atoms FROM substructures") + unique_ha = set() + for ha in test_db_cursor.fetchall(): + self.assertTrue(4 <= ha[0] <= 8) + unique_ha.add(ha[0]) + + [self.assertTrue(ha in unique_ha) for ha in [4, 5, 6, 7, 8]] + + test_db.close() + + # small substructures + db = SubstructureDb(self.to_test_results("test_db.sqlite"), "") + db.create_compound_database() + db.close() + + for record in ["HMDB0000073", "HMDB0000122", "HMDB0000158", "HMDB0000186"]: + record = self.to_test_data(record + ".xml") + + update_substructure_database(self.to_test_data(record), + self.to_test_results("test_db.sqlite"), 1, 1, + method="exhaustive", isomeric_smiles=True) + + test_db = sqlite3.connect(self.to_test_results("test_db.sqlite")) + test_db_cursor = test_db.cursor() + + test_db_cursor.execute("""SELECT smiles, + heavy_atoms, + length, + exact_mass__1, + exact_mass__0_0001, + exact_mass, + C, + H, + N, + O, + P, + S, + 
valence, + valence_atoms, + atoms_available, + bond_types, + dummies + FROM substructures WHERE valence <= 4""") + + for i, row in enumerate(test_db_cursor.fetchall()): + if i == 0: + self.assertEqual(row, ('*N', 1, 3, 16, 16.0187, 16.018724, 0, 2, 1, 0, 0, 0, 1, + '{0: 1}', 1, '{0: [1.0]}', '[1]')) + + self.assertEqual(i, 8) + + test_db.close() + + def test_calculate_hydrogen_rearrangements(self): + + fragment_ions = [(('C', False), ('C', False)), (('C', False), ('C', False), ('C', False), ('C', True)), + (('P', False), ('C', False), ('P', False), ('P', True)), + (('C', True), ('C', False), ('C', False), ('C', False)), + (('C', False), ('P', False), ('P', True), ('P', False)), + (('C', False), ('N', False), ('C', False), ('N', False)), + (('N', False), ('C', False), ('N', False), ('C', False)), + (), + (('C', False),)] + positive_results = [{0, -2}, {1, 3, -5, -3, -1}, {1, 3, 5, -5, -3, -1}, {1, 3, -5, -3, -1}, + {1, 3, -5, -3, -1}, {0, 2, -4, -2}, {0, 2, 4, -2}, {0}, {-1}] + negative_results = [{0, 2, -2}, {1, 3, 5, -5, -3, -1}, {1, 3, -5, -3, -1}, {1, 3, 5, -5, -3, -1}, + {1, 3, 5, -5, -3, -1}, {0, 2, 4, -4, -2}, {0, 2, 4, -2}, {0}, {1, -1}] + + for fragment_ion, positive_result, negative_result in zip(fragment_ions, positive_results, negative_results): + self.assertEqual(calculate_hydrogen_rearrangements(fragment_ion, "+"), positive_result) + self.assertEqual(calculate_hydrogen_rearrangements(fragment_ion, "-"), negative_result) + + +class SubstructureDbTestCase(unittest.TestCase): + temp_results_dir = None + + @classmethod + def to_test_results(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) + + @classmethod + def to_test_data(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) + + @classmethod + def setUpClass(cls): + cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) + + 
shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../test_data"), + cls.to_test_results("test_data")) + + def test_init(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite"), + self.to_test_data("connectivity.sqlite")) + + db.cursor.execute("SELECT * FROM substructures") + first_row = db.cursor.fetchone()[0:18] + self.assertEqual(first_row, (1, '*:c(:*)CCN', 4, 10, 56, 56.05, 56.05002399999998, 3, 6, 1, 0, 0, 0, 2, + '{3: 2}', 1, '{3: [1.5, 1.5]}', '[4, 5]')) + + self.assertTrue(Chem.MolFromSmiles(first_row[1], False)) + self.assertEqual(len(db.cursor.fetchall()), 140) + + db.cursor.execute("SELECT * FROM hmdbid_substructures") + first_row = db.cursor.fetchone() + self.assertEqual(first_row, ('HMDB0000073', 1)) + self.assertEqual(len(db.cursor.fetchall()), 150) + + db.cursor.execute("SELECT * FROM compounds") + first_row = db.cursor.fetchone() + self.assertEqual(first_row, ('HMDB0000073', 153.078979, 'C8H11NO2', 8, 11, 1, 2, 0, 0, 'NCCC1=CC(O)=C(O)C=C1')) + self.assertEqual(len(db.cursor.fetchall()), 3) + + db.cursor.execute("SELECT * FROM graphs.subgraphs") + first_row = db.cursor.fetchone() + self.assertEqual(first_row[0:9], (1, 1, b'A_', 2, '(1, 1)', '(1, 1)', '((1,), (1,))', 2, 1)) + self.assertEqual(len(db.cursor.fetchall()), 107) + + db.close() + + def test_select_compounds(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + for i, cpd_entry in enumerate(db.select_compounds(["HMDB0000158", "HMDB0000122"])): + self.assertLessEqual(i, 2) + self.assertTrue(cpd_entry[0] == "HMDB0000158" or cpd_entry[0] == "HMDB0000122") + + db.close() + + def test_filter_hmdbid_substructures(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + db.filter_hmdbid_substructures(2) + + db.cursor.execute("SELECT COUNT(*) FROM filtered_hmdbid_substructures GROUP BY hmdbid") + for i, hmdbid_count in enumerate(db.cursor.fetchall()): + self.assertGreater(hmdbid_count[0], 1) + + self.assertEqual(i, 3) 
+ + db.close() + + def test_generate_substructure_network(self): # also tests get_substructure_network, get_single_edge and close + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + + self.assertEqual(db.get_single_edge([3, 4, 2]), {2: {2: None, 3: 1, 4: 1}, 3: {3: None, 4: 1}, 4: {4: None}}) + + std = db.generate_substructure_network(min_node_weight=2, return_networkx=True) + + db.cursor.execute("SELECT * FROM filtered_hmdbid_substructures") + for hmdb in db.cursor.fetchall(): + + self.assertTrue(hmdb[1] in std.nodes) + + db.cursor.execute("SELECT DISTINCT substructure_id FROM filtered_hmdbid_substructures") + self.assertEqual(len(db.cursor.fetchall()), 10) + self.assertEqual(std.number_of_nodes(), 10) + + self.assertEqual(std.number_of_edges(), 24) + + edge_count = [] + db.cursor.execute("SELECT * FROM substructure_graph") + for edge in db.cursor.fetchall(): + edge_count.append(std.get_edge_data(edge[0], edge[1])["weight"]) + + self.assertEqual(sum(edge_count), 48) + + db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + self.assertEqual(len(db.cursor.fetchall()), 6) + + db.cursor.execute("CREATE TABLE subset_substructures AS SELECT * FROM COMPOUNDS") + db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + self.assertEqual(len(db.cursor.fetchall()), 7) + + db.close() + + self.assertRaises(sqlite3.ProgrammingError, lambda: db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")) + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + self.assertEqual(len(db.cursor.fetchall()), 7) + + db.close() + + def test_select_mass_values(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + ests = db.select_mass_values("1", [], None) + exacts = db.select_mass_values("0_0001", [], None) + + self.assertEqual(len(ests), 49) + self.assertEqual(len(exacts), 65) + + for exact in exacts: + 
self.assertTrue(round(exact) in ests) + + self.assertEqual(db.select_mass_values("0_0001", [120, 87, 87], None), + [[120.0423], [87.0446], [87.0446]]) + self.assertEqual(db.select_mass_values("0_0001", [55, 80, 107], None), + [[55.0184, 55.0422], [80.0262], [107.0497, 107.0735]]) + + self.assertRaises(sqlite3.OperationalError, + lambda: db.select_mass_values("0_0001", [63, 63, 63], "")) + db.close() + + def test_select_mfs(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + self.assertEqual(db.select_mfs(107.0735, None, "0_0001"), [(7, 9, 1, 0, 0, 0)]) + self.assertEqual(db.select_mfs(107.0735, None, "1"), []) + self.assertEqual(db.select_mfs(107, None, "0_0001"), []) + self.assertEqual(db.select_mfs(107.0735, None, "1"), []) + self.assertEqual(db.select_mfs(107, None, "1"), + [(7, 7, 0, 1, 0, 0), (7, 9, 1, 0, 0, 0)]) + + self.assertRaises(sqlite3.OperationalError, lambda: db.select_mfs(107.0735, "", "0_0001")) + + db.close() + + def test_k_configs(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite"), + self.to_test_data("connectivity.sqlite")) + + k_configs = db.k_configs() + self.assertEqual(len(k_configs), 67) + self.assertEqual(k_configs['((1,), (1,))'], [((0, 1),)]) + self.assertEqual(k_configs['((2, 2), (2, 2), (2, 2))'], + [((0, 2), (0, 4), (1, 3), (1, 5), (2, 4), (3, 5)), + ((0, 2), (0, 5), (1, 3), (1, 4), (2, 5), (3, 4)), + ((0, 3), (0, 5), (1, 2), (1, 4), (2, 4), (3, 5)), + ((0, 3), (0, 4), (1, 2), (1, 5), (2, 5), (3, 4))]) + + db.close() + + def test_select_substructures(self): + + db = SubstructureDb(self.to_test_data("substructures.sqlite")) + self.assertEqual(db.select_substructures([[2, 5, 0, 0, 0, 0]], None), []) + self.assertEqual(len(db.select_substructures([[3, 6, 1, 0, 0, 0]], None)[0]), 1) + self.assertEqual(list(db.select_substructures([[3, 6, 1, 0, 0, 0]], None)[0][0].keys()), + ['smiles', 'mol', 'bond_types', 'degree_atoms', 'valence', 'atoms_available', 'dummies']) + + substructures = 
list(db.select_substructures([[3, 6, 1, 0, 0, 0]], None)[0][0].values()) + self.assertEqual([item for i, item in enumerate(substructures) if i != 1], + ['*:c(:*)CCN', {3: [1.5, 1.5]}, {3: 2}, 2, 1, [4, 5]]) + + self.assertEqual(len(db.select_substructures([[2, 2, 0, 2, 0, 0]], None)[0]), 2) + self.assertEqual(list(db.select_substructures([[2, 2, 0, 2, 0, 0]], None)[0][0].keys()), + ['smiles', 'mol', 'bond_types', 'degree_atoms', 'valence', 'atoms_available', 'dummies']) + substructures = list(db.select_substructures([[2, 2, 0, 2, 0, 0]], None)[0][0].values()) + self.assertEqual([item for i, item in enumerate(substructures) if i != 1], + ['*:c(O)c(:*)O', {1: [1.5], 3: [1.5]}, {1: 1, 3: 1}, 2, 2, [0, 5]]) + + self.assertRaises(sqlite3.OperationalError, + lambda: db.select_substructures([[2, 5, 0, 0, 0, 0]], "")) + db.close() + + def test_create_compound_database(self): # also tests create_indexes + + db = SubstructureDb(self.to_test_results("substructures_new.sqlite")) + db.create_compound_database() + db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + self.assertEqual(len(db.cursor.fetchall()), 4) + + db.create_indexes() + db.close() + + shutil.copyfile(self.to_test_data("substructures.sqlite"), self.to_test_results("substructures_copy.sqlite")) + db = SubstructureDb(self.to_test_results("substructures_copy.sqlite"), + self.to_test_data("connectivity.sqlite")) + db.create_indexes() + db.create_compound_database() + db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + self.assertEqual(len(db.cursor.fetchall()), 4) + + db.cursor.execute("SELECT * FROM substructures") + self.assertEqual(len(db.cursor.fetchall()), 0) + + db.cursor.execute("SELECT * FROM hmdbid_substructures") + self.assertEqual(len(db.cursor.fetchall()), 0) + + db.cursor.execute("SELECT * FROM compounds") + self.assertEqual(len(db.cursor.fetchall()), 0) + + db.cursor.execute("SELECT * FROM graphs.subgraphs") + first_row = db.cursor.fetchone() + 
self.assertEqual(first_row[0:9], (1, 1, b'A_', 2, '(1, 1)', '(1, 1)', '((1,), (1,))', 2, 1)) + self.assertEqual(len(db.cursor.fetchall()), 107) + + db.create_indexes() + db.close() + + def test_calculate_possible_hydrogenations(self): # also tests insert_substructure_ion + + records = [self.to_test_data(r + ".xml") for r in ["HMDB0001245", "HMDB0000263"]] + + create_substructure_database(records, self.to_test_results("substructures.sqlite"), 3, 20, method="exhaustive", + isomeric_smiles=True, max_degree=6, max_atoms_available=2) + + db = SubstructureDb(self.to_test_results("substructures.sqlite")) + + search_statement = """SELECT smiles, hydrogen_modification, valence, substructure_ions.substructure_id + FROM substructure_ions + LEFT JOIN substructures ON substructure_ions.substructure_id = substructures.substructure_id + WHERE modified_exact_mass__0_0001 > ({} {} 1.007276) - 0.01 + AND modified_exact_mass__0_0001 < ({} {} 1.007276) + 0.01 + AND ion_mode_positive = {} + """ + + # HMDB0001245 - 2'-deoxycytidine 5'-diphosphate + mzs = [256.9616, 158.9248, 96.9691, 78.9585] + h_mods = [-2, -1, 1, -1] + valences = [2, 1, 1, 1] + smiles = ["*[C@H]1C[C@H](O)[C@@H](COP(=O)(O)OP(*)(=O)O)O1", "*P(=O)(O)OP(=O)(O)O", "*OP(=O)(O)O", "*P(=O)(O)O"] + + for mz, h_mod, valence, smile in zip(mzs, h_mods, valences, smiles): + + db.cursor.execute(search_statement.format(mz, "+", mz, "+", 0)) + substructure_found = False + + for substructure in db.cursor.fetchall(): + + if substructure[0] == smile: + + substructure_found = True + + self.assertEqual(substructure[1], h_mod) + self.assertEqual(substructure[2], valence) + + self.assertTrue(substructure_found) + + # HMDB0000263 - Phospho(enol)pyruvic acid + mzs = [62.9628, 64.9785, 80.9734, 89.0233, 94.9892, 98.9841, 104.9735, 116.973611, 122.9842, 140.9947, 150.9791] + h_mods = [-1, 1, -1, 1, 2, 1, -2, -1, -1, 3, -1] + valences = [3, 3, 1, 1, 4, 1, 2, 3, 1, 3, 1] + smiles = ['*OP(*)(*)=O', '*OP(*)(*)=O', "*P(=O)(O)O", "*OC(=C)C(=O)O", 
"*C(=*)OP(*)(=O)O", "*OP(=O)(O)O", + "*C(=C)OP(*)(=O)O", "*C(=O)C(=C)OP(*)(*)=O", "*C(=C)OP(=O)(O)O", + "*P(=O)(O)OC(=*)C(=O)O", "*P(=O)(O)OC(=C)C(=O)O"] + + for mz, h_mod, valence, smile in zip(mzs, h_mods, valences, smiles): + + db.cursor.execute(search_statement.format(mz, "-", mz, "-", 1)) + substructure_found = False + + for substructure in db.cursor.fetchall(): + + if substructure[0] == smile: + + substructure_found = True + + self.assertEqual(substructure[1], h_mod) + self.assertEqual(substructure[2], valence) + + break + + self.assertTrue(substructure_found) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_isomorphism_database.py b/tests/test_algorithms.py similarity index 51% rename from tests/test_isomorphism_database.py rename to tests/test_algorithms.py index 0a3608c..a208a82 100644 --- a/tests/test_isomorphism_database.py +++ b/tests/test_algorithms.py @@ -1,84 +1,68 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright © 2019-2020 Ralf Weber -# -# This file is part of MetaboBlend. -# -# MetaboBlend is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# MetaboBlend is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with MetaboBlend. If not, see . 
-# - - -import os -import sys -import unittest -import shutil -import tempfile -from metaboblend.databases import * - - -class IsomorphDbTestCase(unittest.TestCase): - temp_results_dir = None - - @classmethod - def to_test_results(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) - - @classmethod - def to_test_data(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) - - @classmethod - def setUpClass(cls): - cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) - - shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"), - cls.to_test_results("test_data")) - - def test_create_connectivity_database(self): - - pkg_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - # TODO: add RI as dependency - - if sys.platform == "linux" or sys.platform == "linux2": - - self.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36") - - create_connectivity_database(self.to_test_results("connectivity.sqlite"), - 3, # sizes - [1, 2], # boxes - self.path_ri - ) - - ref_db = sqlite3.connect(self.to_test_data("connectivity.sqlite")) - ref_db_cursor = ref_db.cursor() - ref_db_cursor.execute("SELECT * FROM subgraphs") - - test_db = sqlite3.connect(self.to_test_results("connectivity.sqlite")) - test_db_cursor = test_db.cursor() - test_db_cursor.execute("SELECT * FROM subgraphs") - - test_rows = {} - for row in test_db_cursor.fetchall(): - test_rows[row[0]] = row - - # for row in ref_db_cursor.fetchall(): - # self.assertEqual(row, test_rows[row[0]]) - - ref_db.close() - test_db.close() - - -if __name__ == '__main__': - unittest.main() +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Jack Gisby, Ralf Weber +# +# This file is part of MetaboBlend. 
+# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# + +import os +import unittest +import shutil +import tempfile + +from metaboblend.algorithms import * + + +class AlgorithmsTestCase(unittest.TestCase): + temp_results_dir = None + + @classmethod + def to_test_results(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) + + @classmethod + def to_test_data(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) + + @classmethod + def setUpClass(cls): + cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) + + shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"), + cls.to_test_results("test_data")) + + def test_subset_sum(self): # also tests find_path + + self.assertEqual([s_sum for s_sum in subset_sum([1, 2, 3, 4], 5)], [[2, 3], [1, 4]]) + + self.assertEqual(len(list(subset_sum(list(range(60)), 70, 3))), 378) + self.assertEqual(len(list(subset_sum(list(range(60)), 70, 1000))), 29884) + + def test_cosine_spectrum_similarity(self): + + self.assertAlmostEqual(cosine_spectrum_similarity([1, 2, 3], [2, 3, 4]), 0.9892759073362614, places=5) + + # swapping real and theoretical makes no difference + self.assertEqual(cosine_spectrum_similarity([1, 2, 3], [2, 2, 3]), cosine_spectrum_similarity([2, 2, 3], [1, 2, 
3])) + + # real is same as theoretical + self.assertEqual(cosine_spectrum_similarity([1, 2, 3], [1, 2, 3]), 1) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_build_structures.py b/tests/test_build_structures.py deleted file mode 100644 index 32a67c7..0000000 --- a/tests/test_build_structures.py +++ /dev/null @@ -1,459 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright © 2019-2020 Ralf Weber -# -# This file is part of MetaboBlend. -# -# MetaboBlend is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# MetaboBlend is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with MetaboBlend. If not, see . 
-# - - -import unittest -import shutil -import tempfile -from metaboblend.build_structures import * -from metaboblend.databases import * - - -class BuildStructuresTestCase(unittest.TestCase): - temp_results_dir = None - - @classmethod - def to_test_results(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) - - @classmethod - def to_test_data(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) - - @classmethod - def setUpClass(cls): - cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) - - shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"), - cls.to_test_results("test_data")) - - def test_build(self): # core - all other build functions rely on - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - - # ref data - smis = [{'NCCc1cc(O)ccc1O', 'NCCc1cccc(O)c1O', 'NCCc1cc(O)cc(O)c1', 'NCCc1ccc(O)c(O)c1'}, - None, - {'N[C@@H](Cc1ccc(O)cc1)C(=O)O', 'N[C@@H](Cc1cccc(O)c1)C(=O)O', 'N[C@H](Cc1ccc(O)cc1)C(=O)O'}, - None] - std_lens = [4, 51, 3, 1892] - fragments = [56.05, 60.0211, 68.0262, 56.0262] - exp_lens = [1, 41, 2, 0] - - # hmdb records to build from - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - record_dicts = pickle.load(test_hmdbs) - - for i, record_dict in enumerate(record_dicts.values()): - - # test standard building - built_smis = build( - mf=[record_dict["C"], record_dict["H"], record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - exact_mass=record_dict["exact_mass"], max_n_substructures=3, - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - path_substructure_db=self.to_test_data("substructures.sqlite"), clean=True, - prescribed_mass=None, ppm=None, ncpus=None, table_name="substructures", - isomeric_smiles=True, retain_substructures=True - ) - - j = 0 - - for smi 
in built_smis: - j += 1 - - self.assertEqual(j, std_lens[i]) - - if smis[i] is not None: - self.assertEqual(set(built_smis.keys()), smis[i]) - else: - self.assertTrue(len(built_smis.keys()) == 51 or len(built_smis.keys()) == 1892) - - # test prescribed substructure building - built_smis = build( - mf=[record_dict["C"], record_dict["H"], record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - exact_mass=record_dict["exact_mass"], max_n_substructures=3, - prescribed_mass=fragments[i], ppm=15, clean=True, - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - path_substructure_db=self.to_test_data("substructures.sqlite"), - ncpus=None, table_name="substructures", isomeric_smiles=True, - retain_substructures=False - ) - - j = 0 - for smi in built_smis: - j += 1 - - if i == 2: - self.assertEqual(set(built_smis.keys()), {'N[C@@H](Cc1ccc(O)cc1)C(=O)O', 'N[C@@H](Cc1cccc(O)c1)C(=O)O'}) - - self.assertEqual(len(built_smis.keys()), exp_lens[i]) - - db.close() - - def test_substructure_combination_build(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite"), - self.to_test_data("connectivity.sqlite")) - - ec_products = [((4, 5, 0, 0, 0, 0), (4, 6, 1, 2, 0, 0)), - ((5, 5, 0, 2, 0, 0), (3, 6, 1, 0, 0, 0)), - ((2, 4, 0, 2, 0, 0), (4, 8, 0, 4, 0, 0))] - configs_iso = db.k_configs() - lens = [0, 1, 41] - - for i, ec_product in enumerate(ec_products): - substructure_subset = db.select_substructures(ec_product, "substructures") - smis = substructure_combination_build(substructure_subset, configs_iso, - prescribed_structure=False, isomeric_smiles=True, - bond_enthalpies=get_bond_enthalpies(), - retain_substructures=False) - - self.assertEqual(len(smis.keys()), lens[i]) - - if i == 1: - self.assertEqual(list(smis.keys()), ['NCCc1ccc(O)c(O)c1']) - - db.close() - - def test_build_from_subsets(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - - mcs = [[8, 11, 1, 2, 0, 0], [8, 11, 1, 2, 0, 0], [12, 22, 0, 11, 0, 0], 
[10, 0, 0, 0, 0, 0], - [9, 11, 1, 3, 0, 0], [9, 11, 1, 3, 0, 0], [8, 11, 1, 2, 0, 0]] - exact_subsets = [(74.0242, 79.0548), (65.0391, 88.0399), (103.0395, 119.0344, 120.0423), - (84.0449, 97.029), (50.0156, 57.0215, 74.0368), (50.0156, 57.034, 74.0242), - (50.0156, 57.0215, 74.0368)] - - lens = [1, 7, 26, 0, 4, 4, 0] - - for i, mc, exact_subset in zip(range(len(mcs)), mcs, exact_subsets): - substructure_subsets = build_from_subsets(exact_subset, mc, "substructures", db) - - if i == 3 or i == 6: - self.assertEqual(len(substructure_subsets), 0) - else: - self.assertEqual(len(substructure_subsets[0][0]), lens[i]) - - if i == 0: - del substructure_subsets[0][0][0]["mol"] - self.assertEqual(substructure_subsets[0][0], - [{'smiles': '*[C@H](N)C(=O)O', - 'bond_types': {1: [1.0]}, - 'degree_atoms': {1: 1}, - 'valence': 1, - 'atoms_available': 1, - 'dummies': [2]}]) - - db.close() - - def test_generate_structures(self): # tests vs build - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - - fragments = [56.05, 60.0211, 68.0262, 56.0262] - - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - record_dicts = pickle.load(test_hmdbs) - for i, record_dict in enumerate(record_dicts.values()): - ms_data = {record_dict["HMDB_ID"]: {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - "exact_mass": record_dict["exact_mass"]}} - - # test standard building - returned_smis = list( - generate_structures(ms_data, path_substructure_db=self.to_test_data("substructures.sqlite"), - write_csv_output=True, path_out=self.to_test_results(), - max_degree=6, max_atoms_available=2, max_n_substructures=3, - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - minimum_frequency=None, yield_smis=True, isomeric_smiles=True, - retain_substructures=True)) - - returned_smis = returned_smis[0][record_dict["HMDB_ID"]] - - build_smis = build( - mf=[record_dict["C"], record_dict["H"], 
record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - exact_mass=record_dict["exact_mass"], - max_n_substructures=3, path_connectivity_db=self.to_test_data("connectivity.sqlite"), - path_substructure_db=self.to_test_data("substructures.sqlite"), clean=True, - prescribed_mass=None, ppm=None, ncpus=None, table_name="substructures", isomeric_smiles=True, - retain_substructures=True - ) - - self.assertEqual(set(build_smis.keys()), set(returned_smis)) - - ms_data = {record_dict["HMDB_ID"]: {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - "exact_mass": record_dict["exact_mass"], - "prescribed_mass": fragments[i]}} - - # test prescribed building - returned_smis = list( - generate_structures(ms_data, path_substructure_db=self.to_test_data("substructures.sqlite"), - write_csv_output=True, path_out=self.to_test_results(), - max_degree=6, max_atoms_available=2, max_n_substructures=3, - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - minimum_frequency=None, yield_smis=True, isomeric_smiles=True, - retain_substructures=False)) - - returned_smis = returned_smis[0][record_dict["HMDB_ID"]] - - build_smis = build( - mf=[record_dict["C"], record_dict["H"], record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - exact_mass=record_dict["exact_mass"], max_n_substructures=3, - prescribed_mass=fragments[i], ppm=0, retain_substructures=False, - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - path_substructure_db=self.to_test_data("substructures.sqlite"), - ncpus=None, table_name="substructures", clean=True, isomeric_smiles=True - ) - - self.assertEqual(set(build_smis.keys()), set(returned_smis)) - - ms_data = {} - for i, record_dict in enumerate(record_dicts.values()): - record_dict["mol"] = Chem.MolFromSmiles(record_dict["smiles"]) - ms_data[record_dict["HMDB_ID"]] = {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], - 
record_dict["O"], record_dict["P"], record_dict["S"]], - "exact_mass": record_dict["exact_mass"], - "prescribed_masses": None} - - # test building with multiple inputs - returned_smi_list = list( - generate_structures(ms_data, path_substructure_db=self.to_test_data("substructures.sqlite"), - write_csv_output=True, path_out=self.to_test_results(), - max_degree=6, max_atoms_available=2, max_n_substructures=3, - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - minimum_frequency=None, yield_smis=True, isomeric_smiles=True, - retain_substructures=False)) - - for i, record_dict in enumerate(record_dicts.values()): - build_smis = build( - mf=[record_dict["C"], record_dict["H"], record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - exact_mass=record_dict["exact_mass"], retain_substructures=False, - max_n_substructures=3, path_connectivity_db=self.to_test_data("connectivity.sqlite"), - path_substructure_db=self.to_test_data("substructures.sqlite"), clean=True, - prescribed_mass=None, ppm=None, ncpus=None, table_name="substructures", isomeric_smiles=True - ) - - self.assertEqual(set(build_smis.keys()), set(returned_smi_list[i][record_dict["HMDB_ID"]])) - - db.close() - - def test_annotate_msn(self): # tests vs build_msn - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - - overall_lens = [3, 41, 2, 0] - smis = [{'NCCc1cc(O)ccc1O', 'NCCc1ccc(O)c(O)c1', 'NCCc1cc(O)cc(O)c1'}, - None, - {'N[C@@H](Cc1cccc(O)c1)C(=O)O', 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'}, - None] - freqs = [1, 0, 0, 0] - - fragments = [56.05, 60.0211, 68.0262, 56.0262] - - with open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - record_dicts = pickle.load(test_hmdbs) - for i, record_dict in enumerate(record_dicts.values()): - - if not os.path.exists(self.to_test_results("annotate")): - os.mkdir(self.to_test_results("annotate")) - - ms_data = {record_dict["HMDB_ID"]: {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], - 
record_dict["O"], record_dict["P"], record_dict["S"]], - "exact_mass": record_dict["exact_mass"], - "neutral_fragment_masses": fragments}} - - # test standard building - returned_smis = list(annotate_msn( - ms_data, max_degree=6, max_atoms_available=2, max_n_substructures=3, - write_csv_output=True, retain_substructures=False, path_out=self.to_test_results(), - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - path_substructure_db=self.to_test_data("substructures.sqlite"), - minimum_frequency=None, yield_smis=True, isomeric_smiles=True - )) - - returned_smis = returned_smis[0][record_dict["HMDB_ID"]] - - self.assertEqual(len([t[1] for t in returned_smis if t[1] > 1]), freqs[i]) - - if smis[i] is not None: - self.assertEqual(set(t[0] for t in returned_smis), smis[i]) - - if i == 0: - self.assertEqual(returned_smis[2][1], 3) - - ms_data = {} - for i, record_dict in enumerate(record_dicts.values()): - record_dict["mol"] = Chem.MolFromSmiles(record_dict["smiles"]) - ms_data[record_dict["HMDB_ID"]] = {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - "exact_mass": record_dict["exact_mass"], - "neutral_fragment_masses": fragments} - - os.mkdir(self.to_test_results("annotate_multi")) - - # test building with multiple inputs - returned_smi_list = list(annotate_msn( - ms_data, max_degree=6, max_atoms_available=2, max_n_substructures=3, - path_out=self.to_test_results("annotate_multi"), write_csv_output=True, - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - path_substructure_db=self.to_test_data("substructures.sqlite"), - minimum_frequency=None, yield_smis=True, - isomeric_smiles=True, retain_substructures=False - )) - - for i, record_dict in enumerate(record_dicts.values()): - self.assertEqual(len(returned_smi_list[i][record_dict["HMDB_ID"]]), overall_lens[i]) - - db.close() - - def test_results_db(self): - fragments = [56.05, 60.0211, 68.0262, 56.0262] - - with 
open(self.to_test_data("test_hmdbs.dictionary"), "rb") as test_hmdbs: - record_dicts = pickle.load(test_hmdbs) - - ms_data = {} - for i, record_dict in enumerate(record_dicts.values()): - record_dict["mol"] = Chem.MolFromSmiles(record_dict["smiles"]) - ms_data[record_dict["HMDB_ID"]] = {"mf": [record_dict["C"], record_dict["H"], record_dict["N"], - record_dict["O"], record_dict["P"], record_dict["S"]], - "exact_mass": record_dict["exact_mass"], - "neutral_fragment_masses": fragments} - - os.mkdir(self.to_test_results("test_results_db")) - - list(annotate_msn( - ms_data, max_degree=6, max_atoms_available=2, max_n_substructures=3, - path_out=self.to_test_results("test_results_db"), write_csv_output=True, - path_connectivity_db=self.to_test_data("connectivity.sqlite"), - path_substructure_db=self.to_test_data("substructures.sqlite"), - minimum_frequency=None, yield_smis=True, - isomeric_smiles=True, retain_substructures=True - )) - - # is the sqlite database the size we expect? - self.assertEqual(os.path.getsize(self.to_test_results("test_results_db", "metaboblend_results.sqlite")), 53248) - - # are the csv files the same as the reference? 
- with open(self.to_test_results("test_results_db", "metaboblend_queries.csv"), "r") as results_file, \ - open(self.to_test_data("metaboblend_queries.csv"), "r") as test_file: - - for results_line, test_line in zip(results_file, test_file): - self.assertEqual(results_line, test_line) - - with open(self.to_test_results("test_results_db", "metaboblend_structures.csv"), "r") as results_file, \ - open(self.to_test_data("metaboblend_structures.csv"), "r") as test_file: - - for results_line, test_line in zip(results_file, test_file): - self.assertEqual(results_line, test_line) - - def test_gen_subs_table(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite"), "") - table_name = gen_subs_table(db, 5, 6, 4, 2, 500) - - i = 0 - db.cursor.execute("SELECT heavy_atoms, valence, atoms_available FROM %s" % table_name) - for row in db.cursor.fetchall(): - i += 1 - - self.assertTrue(row[0] in range(5, 7)) - self.assertTrue(row[1] <= 4) - self.assertTrue(row[2] <= 2) - - self.assertEqual(i, 58) - - db.close() - - def test_subset_sum(self): # also tests find_path - self.assertEqual([s_sum for s_sum in subset_sum([1, 2, 3, 4], 5)], [[2, 3], [1, 4]]) - - self.assertEqual(len(list(subset_sum(list(range(60)), 70, 3))), 378) - self.assertEqual(len(list(subset_sum(list(range(60)), 70, 1000))), 29884) - - def test_combine_ecs(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite"), "") - self.assertEqual(combine_mfs([54.0106, 69.0578], db, "substructures", "0_0001"), - [[(3, 2, 0, 1, 0, 0)], [(4, 7, 1, 0, 0, 0)]]) - self.assertEqual(combine_mfs([54, 69], db, "substructures", "1"), - [[(3, 2, 0, 1, 0, 0)], [(4, 7, 1, 0, 0, 0), (4, 5, 0, 1, 0, 0)]]) - self.assertEqual(combine_mfs([54.0101, 69.0580], db, "substructures", "0_0001"), []) - - db.close() - - def test_reindex_atoms(self): - substructure_combinations = [ - [{'smiles': '*C(*)C(=O)O', 'mol': None, 'bond_types': {1: [1.0, 1.0]}, 'degree_atoms': {1: 2}, - 'valence': 2, 'atoms_available': 1, 'dummies': [0, 
2]}, - {'smiles': 'NCCc1c:*:*:cc1', 'mol': None, 'bond_types': {4: [1.5], 6: [1.5], 7: [1.5]}, - 'degree_atoms': {4: 1, 7: 1}, 'valence': 2, 'atoms_available': 2, 'dummies': [5, 6]}], - [{'smiles': '*[C@@H](O)[C@@H](*)O', 'mol': None, 'bond_types': {1: [1.0], 3: [1.0]}, - 'degree_atoms': {1: 1, 3: 1}, 'valence': 2, 'atoms_available': 2, 'dummies': [0, 5]}, - {'smiles': 'OC1**[C@@H](O)[C@H](O)[C@H]1O', 'mol': None, 'bond_types': {0: [1.0], 3: [1.0], 4: [1.0]}, - 'degree_atoms': {0: 1, 4: 1}, 'valence': 2, 'atoms_available': 2, 'dummies': [2, 3]}], - [{'smiles': '*C[C@H](N)C(=O)O', 'mol': None, 'bond_types': {2: [1.0]}, 'degree_atoms': {2: 1}, - 'valence': 1, 'atoms_available': 1, 'dummies': [3]}, - {'smiles': '*c1ccc(O)cc1', 'mol': None, 'bond_types': {1: [1.0]}, 'degree_atoms': {1: 1}, - 'valence': 1, 'atoms_available': 1, 'dummies': [0]}] - ] - - reindexed = [ - ["*C(*)C(=O)O.NCCc1c:*:*:cc1", [1, 10, 13], [0, 2, 11, 12], - {1: [1.0, 1.0], 10: [1.5], 12: [1.5], 13: [1.5]}], - ["*[C@@H](O)[C@@H](*)O.OC1**[C@@H](O)[C@H](O)[C@H]1O", [1, 3, 6, 10], [0, 5, 8, 9], - {1: [1.0], 3: [1.0], 6: [1.0], 9: [1.0], 10: [1.0]}], - ["*C[C@H](N)C(=O)O.*c1ccc(O)cc1", [2, 8], [3, 7], {2: [1.0], 8: [1.0]}] - ] - - for substructure_combination, reindex in zip(substructure_combinations, reindexed): - substructure_combination[0]["mol"] = Chem.MolFromSmiles(substructure_combination[0]["smiles"], False) - substructure_combination[1]["mol"] = Chem.MolFromSmiles(substructure_combination[1]["smiles"], False) - - mol_comb, atoms_available, atoms_to_remove, bond_types, bond_mismatch = reindex_atoms(substructure_combination) - self.assertEqual([Chem.MolToSmiles(mol_comb), atoms_available, atoms_to_remove, bond_types], reindex) - - def test_add_bonds(self): - mol_comb = [Chem.MolFromSmiles("*C(*)C(=O)O.NCCc1c:*:*:cc1", False), - Chem.MolFromSmiles("*[C@@H](O)[C@@H](*)O.OC1**[C@@H](O)[C@H](O)[C@H]1O", False), - Chem.MolFromSmiles("*C[C@H](N)C(=O)O.*c1ccc(O)cc1", False)] - - atoms_available = [[1, 
10, 13], [1, 3, 6, 10], [2, 8]] - - bond_types = [{1: [1.0, 1.0], 10: [1.5], 12: [1.5], 13: [1.5]}, - {1: [1.0], 3: [1.0], 6: [1.0], 9: [1.0], 10: [1.0]}, - {2: [1.0], 8: [1.0]}] - - edges = [((0, 1), (0, 2)), ((0, 2), (1, 3)), ((0, 1),)] - - mol_out = [None, "*[CH]1(O)OC2**[CH](O)(C(O)C2O)[CH]1(*)O", "*C[CH](N)(C(=O)O)c1(*)ccc(O)cc1"] - - for i in range(len(atoms_available)): - mol_e, total_bde = add_bonds(mol_comb[i], edges[i], atoms_available[i], bond_types[i], get_bond_enthalpies()) - - if i == 0: - self.assertTrue(mol_e is None) - else: - self.assertEqual(Chem.MolToSmiles(mol_e.GetMol(), False), mol_out[i]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/test_data/HMDB0000263.xml b/tests/test_data/HMDB0000263.xml new file mode 100644 index 0000000..fe985f4 --- /dev/null +++ b/tests/test_data/HMDB0000263.xml @@ -0,0 +1,1354 @@ + + + + 4.0 + 2005-11-16 15:48:42 UTC + 2020-09-15 17:09:00 UTC + HMDB0000263 + quantified + + HMDB00263 + + Phosphoenolpyruvic acid + Phosphoenolpyruvate (PEP) is an important chemical compound in biochemistry. It has a high energy phosphate bond, and is involved in glycolysis and gluconeogenesis. In glycolysis, PEP is formed by the action of the enzyme enolase on 2-phosphoglycerate. Metabolism of PEP to pyruvate by pyruvate kinase (PK) generates 1 molecule of adenosine triphosphate (ATP) via substrate-level phosphorylation. ATP is one of the major currencies of chemical energy within cells. In gluconeogenesis, PEP is formed from the decarboxylation of oxaloacetate and hydrolysis of 1 guanosine triphosphate molecule. This reaction is catalyzed by the enzyme phosphoenolpyruvate carboxykinase (PEPCK). This reaction is a rate-limiting step in gluconeogenesis. (wikipedia). 
+ + 2-(Phosphonooxy)-2-propenoic acid + 2-PHOSPHOENOLPYRUVIC ACID + PEP + PHOSPHOENOLPYRUVATE + 2-(Phosphonooxy)-2-propenoate + 2-PHOSPHOENOLPYRUVate + 2-Hydroxy-acrylic acid dihydrogen phosphate + 2-Phosphonooxyprop-2-enoate + 2-Phosphonooxyprop-2-enoic acid + p-enol-Pyruvate + PEP (phosphate) + Phosphoenolpyruvic acid + + C3H5O6P + 168.042 + 167.982374404 + 2-(phosphonooxy)prop-2-enoic acid + phosphoenolpyruvic acid + 138-08-9 + OC(=O)C(=C)OP(O)(O)=O + InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(H,4,5)(H2,6,7,8) + DTBNBXWJWCWCIK-UHFFFAOYSA-N + + belongs to the class of organic compounds known as phosphate esters. These are organic compounds containing phosphoric acid ester functional group, with the general structure R1P(=O)(R2)OR3. R1,R2 = O,N, or halogen atom; R3 = organyl group. + Phosphate esters + Organic compounds + Organic acids and derivatives + Organic phosphoric acids and derivatives + Phosphate esters + Aliphatic acyclic compounds + + Carbonyl compounds + Carboxylic acids + Hydrocarbon derivatives + Monocarboxylic acids and derivatives + Organic oxides + + + Aliphatic acyclic compound + Carbonyl group + Carboxylic acid + Carboxylic acid derivative + Hydrocarbon derivative + Monocarboxylic acid or derivatives + Organic oxide + Organic oxygen compound + Organooxygen compound + Phosphoric acid ester + + + carboxyalkyl phosphate + monocarboxylic acid + + + + + Disposition + A concept that describes the origin of a chemical, its location within an organism, or its route of exposure. + + 1 + parent + + + Route of exposure + A mean by which a chemical agent comes in contact with an organism, either under intended or unintended circumstances. + 7724 + 2 + parent + + + + + Enteral + Chemical exposure via the alimentary canal (mouth to anus). + 7743 + 3 + parent + + + + + Ingestion + Chemical exposure facilitated by entry through the mouth. + 7744 + 4 + child + + Digestion + + + + + + + + Source + Natural or synthetic origin of a chemical. 
+ 7724 + 2 + parent + + + + + Endogenous + + 7735 + 3 + child + + + + + Food + + 7735 + 3 + child + + + + + Biological + A living organism (species or a higher taxonomy rank), in which a chemical can be found. + 7735 + 3 + parent + + + + + Animal + A living organism belonging to the kingdom animalia. it feeds on organic matter, typically having specialized sense organs and nervous system and able to respond rapidly to stimuli. + 7736 + 4 + child + + Fauna + + + + Plant + A living organism belonging to the kingdom plantea. typically, it grows in a permanent site, absorbs water and inorganic substances through its roots, and synthesizes nutrients in its leaves by photosynthesis using the green pigment chlorophyll. examples incude trees, shrubs, herbs, grasses, ferns, and mosses. + 7736 + 4 + parent + + Flora + + + + Poaceae + + 7738 + 5 + child + + Gramineae + + + + Fabaceae + + 7738 + 5 + child + + Papilionoideae + Legume + + + + Glycine max + + 7738 + 5 + child + + Soy + Soya + Soybean + Soya bean + + + + Cucurbitaceae + + 7738 + 5 + child + + Cucurbits + Gourds + + + + Theobroma cacao + + 7738 + 5 + child + + + + + + + + + + + Biological location + The physiological origin within an organism, including anatomical compnents, biofluids and excreta. + 7724 + 2 + parent + + + + + Tissue and substructures + An anatomical organizational level including multiple cells yet not comprising a complete organ . + 7725 + 3 + parent + + + + + Placenta + An organ present in some vertebrates during embryonic gestation that surrounds the fetus and provides it with nutrients and oxygen, facilitates gas and waste exchange between the fetus and mother, and provides parasitic cloaking from the mother's immune system by excretion of neurokinin b. (nci). + 7729 + 4 + child + + + + + + + Organ and components + An anatomical organizational level including multiple tissues or substructures, which enables a common biological function. 
+ 7725 + 3 + parent + + + + + Prostate + The male reproductive accessory gland that produces prostatic fluid and is located adjacent to or around the urethra distal to the urinary bladder in mammals. (nci). + 7727 + 4 + child + + Prostate gland + + + + + + Biofluid and excreta + A liquid, semi-solid or solid material originating in the body. + 7725 + 3 + parent + + + + + Saliva + The watery fluid in the mouth made by the salivary glands. saliva moistens food to help digestion and it helps protect the mouth against infections. (nci). + 7731 + 4 + child + + + + + Blood + A liquid tissue with the primary function of transporting oxygen and carbon dioxide (nci). it supplies the tissues with nutrients, removes waste products, and contains various components of the immune system defending the body against infection. + 7731 + 4 + child + + + + + + + Subcellular + An anatomical organizational level including a component within a biological cell . + 7725 + 3 + parent + + + + + Cytoplasm + The portion of the cell contained within the plasma membrane but excluding the nucleus. + 7730 + 4 + child + + Cytoplasma + + + + Mitochondria + + 7730 + 4 + child + + + + + + + + + + + Process + Biological or chemical events, or a series thereof, leading to a known function or end-product. + + 1 + parent + + + Naturally occurring process + Naturally-occurring molecular events or a series thereof, leading to a known function or end-product. + 7659 + 2 + parent + + + + + Biological process + Biological or chemical events or a series thereof, leading to a known function or end-product within an organism. + 7660 + 3 + parent + + + + + Biochemical pathway + A linked series of chemical reactions that occur in a defined order within or between organism cells, and lead to a known function or end product. + 7661 + 4 + parent + + + + + Amino Sugar Metabolism + glycosaminoglycans (gags) or mucopolysaccharides are long unbranched polysaccharides consisting of a repeating disaccharide unit. 
members of the glycosaminoglycan family vary in the type of hexosamine, hexose or hexuronic acid unit they contain (e. + 7662 + 5 + child + + + + + gluconeogenesis + in animals, gluconeogenesis takes place mainly in the liver and, to a lesser extent, in the cortex of kidneys. this process occurs during periods of fasting, starvation, or intense exercise. + 7662 + 5 + child + + + + + Glycogenosis, Type IB + the accumulation of glycogen in certain organs and tissues, especially the liver, kidneys, and small intestines, impairs their ability to function normally. people with gsdi may experience delayed puberty. + 7662 + 5 + child + + + + + Glycogenosis, Type IC + the accumulation of glycogen in certain organs and tissues, especially the liver, kidneys, and small intestines, impairs their ability to function normally. people with gsdi may experience delayed puberty. + 7662 + 5 + child + + + + + glycolysis + the free energy released in this process is used to form the high energy compounds, atp (adenosine triphosphate) and nadh (reduced nicotinamide adenine dinucleotide). glycolysis is a sequence of ten reactions involving ten intermediate compounds. + 7662 + 5 + child + + + + + Pyruvate Metabolism + it is the end product of glycolysis and the starting point for gluconeogenesis, and can be generated by transamination of alanine. it can be converted by the pyruvate dehydrogenase complex to acetyl coa which can enter the tca cycle or serve as the starting point for the synthesis of long chain fatty acids, steroids, and ketone bodies. + 7662 + 5 + child + + + + + Triosephosphate isomerase + it is the most severe glycolytic enzyme defect associated with progressive neurologic dysfunction. + 7662 + 5 + child + + + + + Warburg effect + as the krebs cycle is an aerobic process, in normal cells lactate production is reserved for anaerobic conditions. 
however, cancer cells preferentially utilize glucose for lactate production via this “aerobic glycolysis”, even when oxygen is plentiful. + 7662 + 5 + child + + Aerobic glycolysis + + + + + + + + + + + + Role + The purpose or function assumed by a chemical, either naturally or as intended by humans . + + 1 + parent + + + Industrial application + The assumed function of a chemical utilized by human. + 7671 + 2 + parent + + + + + Pharmaceutical industry + A pharmacologic activity for which a chemical substance is utilized owing to its biological role. + 7678 + 3 + parent + + + + + Pharmaceutical + + 7679 + 4 + child + + + + + + + + + + + Solid + + + + + logp + -1.22 + ALOGPS + + + logs + -1.10 + ALOGPS + + + solubility + 13.2 g/L + ALOGPS + + + logp + -0.64 + ChemAxon + + + pka_strongest_acidic + 0.76 + ChemAxon + + + iupac + 2-(phosphonooxy)prop-2-enoic acid + ChemAxon + + + average_mass + 168.042 + ChemAxon + + + mono_mass + 167.982374404 + ChemAxon + + + smiles + OC(=O)C(=C)OP(O)(O)=O + ChemAxon + + + formula + C3H5O6P + ChemAxon + + + inchi + InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(H,4,5)(H2,6,7,8) + ChemAxon + + + inchikey + DTBNBXWJWCWCIK-UHFFFAOYSA-N + ChemAxon + + + polar_surface_area + 104.06 + ChemAxon + + + refractivity + 30.13 + ChemAxon + + + polarizability + 11.57 + ChemAxon + + + rotatable_bond_count + 3 + ChemAxon + + + acceptor_count + 5 + ChemAxon + + + donor_count + 3 + ChemAxon + + + physiological_charge + -3 + ChemAxon + + + formal_charge + 0 + ChemAxon + + + number_of_rings + 0 + ChemAxon + + + bioavailability + Yes + ChemAxon + + + rule_of_five + Yes + ChemAxon + + + ghose_filter + No + ChemAxon + + + veber_rule + No + ChemAxon + + + mddr_like_rule + No + ChemAxon + + + + + Specdb::CMs + 505 + + + Specdb::CMs + 2960 + + + Specdb::CMs + 30667 + + + Specdb::CMs + 37391 + + + Specdb::NmrOneD + 1298 + + + Specdb::NmrOneD + 4820 + + + Specdb::NmrOneD + 4821 + + + Specdb::NmrTwoD + 1010 + + + Specdb::NmrTwoD + 1246 + + + Specdb::MsMs + 457 + + + 
Specdb::MsMs + 458 + + + Specdb::MsMs + 459 + + + Specdb::MsMs + 3822 + + + Specdb::MsMs + 3823 + + + Specdb::MsMs + 3824 + + + Specdb::MsMs + 3825 + + + Specdb::MsMs + 3826 + + + Specdb::MsMs + 3827 + + + Specdb::MsMs + 3828 + + + Specdb::MsMs + 3829 + + + Specdb::MsMs + 3830 + + + Specdb::MsMs + 3831 + + + Specdb::MsMs + 3832 + + + Specdb::MsMs + 3833 + + + Specdb::MsMs + 3834 + + + Specdb::MsMs + 3835 + + + Specdb::MsMs + 3836 + + + Specdb::MsMs + 3837 + + + Specdb::MsMs + 3838 + + + Specdb::MsMs + 3839 + + + Specdb::MsMs + 3840 + + + Specdb::MsMs + 3841 + + + Specdb::MsMs + 3842 + + + Specdb::MsMs + 3843 + + + + + Cytoplasm + Mitochondria + + + Blood + Cellular Cytoplasm + Saliva + + + Placenta + Prostate + + + + Amino Sugar Metabolism + SMP00045 + map00520 + + + Fanconi-bickel syndrome + SMP00572 + + + + Fructose-1,6-diphosphatase deficiency + SMP00562 + + + + G(M2)-Gangliosidosis: Variant B, Tay-sachs disease + SMP00534 + + + + Gluconeogenesis + SMP00128 + map00010 + + + Glycogen Storage Disease Type 1A (GSD1A) or Von Gierke Disease + SMP00374 + + + + Glycogenosis, Type IA. Von gierke disease + SMP00581 + + + + Glycogenosis, Type IB + SMP00573 + + + + Glycogenosis, Type IC + SMP00574 + + + + Glycogenosis, Type VII. 
Tarui disease + SMP00531 + + + + Glycolysis + SMP00040 + map00010 + + + Leigh Syndrome + SMP00196 + + + + Phosphoenolpyruvate carboxykinase deficiency 1 (PEPCK1) + SMP00560 + + + + Primary hyperoxaluria II, PH2 + SMP00558 + + + + Pyruvate Decarboxylase E1 Component Deficiency (PDHE1 Deficiency) + SMP00334 + + + + Pyruvate Dehydrogenase Complex Deficiency + SMP00212 + + + + Pyruvate kinase deficiency + SMP00559 + + + + Pyruvate Metabolism + SMP00060 + map00620 + + + Salla Disease/Infantile Sialic Acid Storage Disease + SMP00240 + + + + Sialuria or French Type Sialuria + SMP00216 + + + + Tay-Sachs Disease + SMP00390 + + + + Triosephosphate isomerase + SMP00563 + + + + Warburg Effect + SMP00654 + + + + + + + Blood + 7.6 +/- 2.9 + uM + Newborn (0-30 days old) + Both + Normal + + + Geigy Scientific Tables, 8th Rev edition, pp. 165-177. Edited by C. Lentner, West Cadwell, N.J.: Medical education Div., Ciba-Geigy Corp., Basel, Switzerland c1981-1992. + + + + + + Blood + 17.4 +/- 3.8 + uM + Adult (>18 years old) + Both + Normal + + + Geigy Scientific Tables, 8th Rev edition, pp. 165-177. Edited by C. Lentner, West Cadwell, N.J.: Medical education Div., Ciba-Geigy Corp., Basel, Switzerland c1981-1992. + + + + + + Cellular Cytoplasm + 17.0 (15.0-19.0) + uM + Adult (>18 years old) + Both + Normal + + + Nakayama Y, Kinoshita A, Tomita M: Dynamic simulation of red blood cell metabolism and its application to the analysis of a pathological condition. Theor Biol Med Model. 2005 May 9;2:18. + 15882454 + + + + + Saliva + 2.07 +/- 1.22 + uM + Adult (>18 years old) + Female + Normal + + + Tsuruoka M, Hara J, Hirayama A, Sugimoto M, Soga T, Shankle WR, Tomita M: Capillary electrophoresis-mass spectrometry-based metabolome analysis of serum and saliva from neurodegenerative dementia patients. Electrophoresis. 2013 Oct;34(19):2865-72. doi: 10.1002/elps.201300019. Epub 2013 Sep 6. 
+ 23857558 + + + + + Saliva + 0.219 +/- 0.312 + uM + Adult (>18 years old) + Male + Normal + Saliva samples were collected at 16:00 (n=27) + + + Sugimoto et al. (2013) Physiological and environmental parameters associated with mass spectrometry-based salivary metabolomic profiles. + + + + + + Saliva + 0.312 +/- 0.332 + uM + Adult (>18 years old) + Not Specified + Normal + Afternoon (n = 18). Saliva samples were collected by the absorbent method using salivettes. + + + Sugimoto et al. (2013) Physiological and environmental parameters associated with mass spectrometry-based salivary metabolomic profiles. + + + + + + Saliva + 0.419 +/- 0.270 + uM + Adult (>18 years old) + Not Specified + Normal + Morning (n = 86). Saliva samples were collected by the absorbent method using salivettes. + + + Sugimoto et al. (2013) Physiological and environmental parameters associated with mass spectrometry-based salivary metabolomic profiles. + + + + + + Saliva + 0.463 +/- 0.275 + uM + Adult (>18 years old) + Female + Normal + Saliva samples were collected at 16:00 (n=24) + + + Sugimoto et al. (2013) Physiological and environmental parameters associated with mass spectrometry-based salivary metabolomic profiles. + + + + + + + + Saliva + 1.94 +/- 1.62 + uM + Adult (>18 years old) + Male + Alzheimer's disease + + + Tsuruoka M, Hara J, Hirayama A, Sugimoto M, Soga T, Shankle WR, Tomita M: Capillary electrophoresis-mass spectrometry-based metabolome analysis of serum and saliva from neurodegenerative dementia patients. Electrophoresis. 2013 Oct;34(19):2865-72. doi: 10.1002/elps.201300019. Epub 2013 Sep 6. + 23857558 + + + + + Saliva + 2.45 +/- 3.70 + uM + Adult (>18 years old) + Male + Frontotemporal lobe dementia + + + Tsuruoka M, Hara J, Hirayama A, Sugimoto M, Soga T, Shankle WR, Tomita M: Capillary electrophoresis-mass spectrometry-based metabolome analysis of serum and saliva from neurodegenerative dementia patients. Electrophoresis. 2013 Oct;34(19):2865-72. 
doi: 10.1002/elps.201300019. Epub 2013 Sep 6. + 23857558 + + + + + Saliva + 2.36 +/- 3.13 + uM + Adult (>18 years old) + Both + Lewy body disease + + + Tsuruoka M, Hara J, Hirayama A, Sugimoto M, Soga T, Shankle WR, Tomita M: Capillary electrophoresis-mass spectrometry-based metabolome analysis of serum and saliva from neurodegenerative dementia patients. Electrophoresis. 2013 Oct;34(19):2865-72. doi: 10.1002/elps.201300019. Epub 2013 Sep 6. + 23857558 + + + + + + + Alzheimer's disease + 104300 + + + Fonteh AN, Harrington RJ, Tsai A, Liao P, Harrington MG: Free amino acid and dipeptide changes in the body fluids from Alzheimer's disease subjects. Amino Acids. 2007 Feb;32(2):213-24. Epub 2006 Oct 10. + 17031479 + + + Selley ML, Close DR, Stern SE: The effect of increased concentrations of homocysteine on the concentration of (E)-4-hydroxy-2-nonenal in the plasma and cerebrospinal fluid of patients with Alzheimer's disease. Neurobiol Aging. 2002 May-Jun;23(3):383-8. + 11959400 + + + Shetty HU, Holloway HW, Schapiro MB: Cerebrospinal fluid and plasma distribution of myo-inositol and other polyols in Alzheimer disease. Clin Chem. 1996 Feb;42(2):298-302. + 8595727 + + + Jia JP, Jia JM, Zhou WD, Xu M, Chu CB, Yan X, Sun YX: Differential acetylcholine and choline concentrations in the cerebrospinal fluid of patients with Alzheimer's disease and vascular dementia. Chin Med J (Engl). 2004 Aug;117(8):1161-4. + 15361288 + + + Redjems-Bennani N, Jeandel C, Lefebvre E, Blain H, Vidailhet M, Gueant JL: Abnormal substrate levels that depend upon mitochondrial function in cerebrospinal fluid from Alzheimer patients. Gerontology. 1998;44(5):300-4. + 9693263 + + + Walter A, Korth U, Hilgert M, Hartmann J, Weichel O, Hilgert M, Fassbender K, Schmitt A, Klein J: Glycerophosphocholine is elevated in cerebrospinal fluid of Alzheimer patients. Neurobiol Aging. 2004 Nov-Dec;25(10):1299-303. 
+ 15465626 + + + Leoni V, Masterman T, Mousavi FS, Wretlind B, Wahlund LO, Diczfalusy U, Hillert J, Bjorkhem I: Diagnostic use of cerebral and extracerebral oxysterols. Clin Chem Lab Med. 2004 Feb;42(2):186-91. + 15061359 + + + Raskind MA, Peskind ER, Holmes C, Goldstein DS: Patterns of cerebrospinal fluid catechols support increased central noradrenergic responsiveness in aging and Alzheimer's disease. Biol Psychiatry. 1999 Sep 15;46(6):756-65. + 10494443 + + + Lovell MA, Markesbery WR: Ratio of 8-hydroxyguanine in intact DNA to free 8-hydroxyguanine is increased in Alzheimer disease ventricular cerebrospinal fluid. Arch Neurol. 2001 Mar;58(3):392-6. + 11255442 + + + Bar KJ, Franke S, Wenda B, Muller S, Kientsch-Engel R, Stein G, Sauer H: Pentosidine and N(epsilon)-(carboxymethyl)-lysine in Alzheimer's disease and vascular dementia. Neurobiol Aging. 2003 Mar-Apr;24(2):333-8. + 12498967 + + + Serot JM, Barbe F, Arning E, Bottiglieri T, Franck P, Montagne P, Nicolas JP: Homocysteine and methylmalonic acid concentrations in cerebrospinal fluid: relation with age and Alzheimer's disease. J Neurol Neurosurg Psychiatry. 2005 Nov;76(11):1585-7. + 16227558 + + + Molina JA, Jimenez-Jimenez FJ, Aguilar MV, Meseguer I, Mateos-Vega CJ, Gonzalez-Munoz MJ, de Bustos F, Porta J, Orti-Pareja M, Zurdo M, Barrios E, Martinez-Para MC: Cerebrospinal fluid levels of transition metals in patients with Alzheimer's disease. J Neural Transm (Vienna). 1998;105(4-5):479-88. + 9720975 + + + Molina JA, Jimenez-Jimenez FJ, Hernanz A, Fernandez-Vivancos E, Medina S, de Bustos F, Gomez-Escalonilla C, Sayed Y: Cerebrospinal fluid levels of thiamine in patients with Alzheimer's disease. J Neural Transm (Vienna). 2002 Jul;109(7-8):1035-44. + 12111441 + + + Bocca B, Forte G, Petrucci F, Pino A, Marchione F, Bomboi G, Senofonte O, Giubilei F, Alimonti A: Monitoring of chemical elements and oxidative damage in patients affected by Alzheimer's disease. Ann Ist Super Sanita. 2005;41(2):197-203. 
+ 16244393 + + + Kristensen MO, Gulmann NC, Christensen JE, Ostergaard K, Rasmussen K: Serum cobalamin and methylmalonic acid in Alzheimer dementia. Acta Neurol Scand. 1993 Jun;87(6):475-81. + 8356878 + + + Abe T, Tohgi H, Isobe C, Murata T, Sato C: Remarkable increase in the concentration of 8-hydroxyguanosine in cerebrospinal fluid from patients with Alzheimer's disease. J Neurosci Res. 2002 Nov 1;70(3):447-50. + 12391605 + + + Reichman ME, Judd JT, Longcope C, Schatzkin A, Clevidence BA, Nair PP, Campbell WS, Taylor PR: Effects of alcohol consumption on plasma and urinary hormone concentrations in premenopausal women. J Natl Cancer Inst. 1993 May 5;85(9):722-7. + 8478958 + + + Hozumi I, Hasegawa T, Honda A, Ozawa K, Hayashi Y, Hashimoto K, Yamada M, Koumura A, Sakurai T, Kimura A, Tanaka Y, Satoh M, Inuzuka T: Patterns of levels of biological metals in CSF differ among neurodegenerative diseases. J Neurol Sci. 2011 Apr 15;303(1-2):95-9. doi: 10.1016/j.jns.2011.01.003. Epub 2011 Feb 2. + 21292280 + + + Motawaj M, Peoc'h K, Callebert J, Arrang JM: CSF levels of the histamine metabolite tele-methylhistamine are only slightly decreased in Alzheimer's disease. J Alzheimers Dis. 2010;22(3):861-71. doi: 10.3233/JAD-2010-100381. + 20858978 + + + Smach MA, Jacob N, Golmard JL, Charfeddine B, Lammouchi T, Ben Othman L, Dridi H, Bennamou S, Limem K: Folate and homocysteine in the cerebrospinal fluid of patients with Alzheimer's disease or dementia: a case control study. Eur Neurol. 2011;65(5):270-8. doi: 10.1159/000326301. Epub 2011 Apr 8. + 21474939 + + + Linnebank M, Popp J, Smulders Y, Smith D, Semmler A, Farkas M, Kulic L, Cvetanovska G, Blom H, Stoffel-Wagner B, Kolsch H, Weller M, Jessen F: S-adenosylmethionine is decreased in the cerebrospinal fluid of patients with Alzheimer's disease. Neurodegener Dis. 2010;7(6):373-8. doi: 10.1159/000309657. Epub 2010 Jun 3. 
+ 20523031 + + + Rosler N, Wichart I, Jellinger KA: Clinical significance of neurobiochemical profiles in the lumbar cerebrospinal fluid of Alzheimer's disease patients. J Neural Transm (Vienna). 2001;108(2):231-46. + 11314776 + + + Sunderland T, Berrettini WH, Molchan SE, Lawlor BA, Martinez RA, Vitiello B, Tariot PN, Cohen RM: Reduced cerebrospinal fluid dynorphin A1-8 in Alzheimer's disease. Biol Psychiatry. 1991 Jul 1;30(1):81-7. + 1716470 + + + Tsuruoka M, Hara J, Hirayama A, Sugimoto M, Soga T, Shankle WR, Tomita M: Capillary electrophoresis-mass spectrometry-based metabolome analysis of serum and saliva from neurodegenerative dementia patients. Electrophoresis. 2013 Oct;34(19):2865-72. doi: 10.1002/elps.201300019. Epub 2013 Sep 6. + 23857558 + + + + + Frontotemporal dementia + 600274 + + + Tsuruoka M, Hara J, Hirayama A, Sugimoto M, Soga T, Shankle WR, Tomita M: Capillary electrophoresis-mass spectrometry-based metabolome analysis of serum and saliva from neurodegenerative dementia patients. Electrophoresis. 2013 Oct;34(19):2865-72. doi: 10.1002/elps.201300019. Epub 2013 Sep 6. + 23857558 + + + + + Lewy body disease + + + + Tsuruoka M, Hara J, Hirayama A, Sugimoto M, Soga T, Shankle WR, Tomita M: Capillary electrophoresis-mass spectrometry-based metabolome analysis of serum and saliva from neurodegenerative dementia patients. Electrophoresis. 2013 Oct;34(19):2865-72. doi: 10.1002/elps.201300019. Epub 2013 Sep 6. + 23857558 + + + + + C00074 + DB01819 + FDB031112 + 980 + + 44897 + 1005 + PHOSPHO-ENOL-PYRUVATE + C00000798 + Phosphoenolpyruvic_acid + + + + PEP + + Simon, Ethan S.; Grabowski, Sven; Whitesides, George M. Preparation of phosphoenolpyruvate from D-(-)-3-phosphoglyceric acid for use in regeneration of ATP. Journal of the American Chemical Society (1989), 111(24), 8920-1. + + + Krogh P: Role of ochratoxin in disease causation. Food Chem Toxicol. 1992 Mar;30(3):213-24. 
+ 1618445 + + + Germaine GR, Tellefson LM: Promotion of Streptococcus mutans glucose transport by human whole saliva and parotid fluid. Infect Immun. 1985 Apr;48(1):7-13. + 3980096 + + + Schatzberger P: Maternity services. BMJ. 1992 May 23;304(6838):1382-3. + 1611358 + + + Orye E, Verhaaren H, Samuel K, van Mele B: A 46,XX,10Q+ chromosome constitution in a girl. Partial long arm duplication or insertional translocation? Humangenetik. 1975 May 26;28(1):1-8. + 1150258 + + + Landau BR, Chandramouli V, Schumann WC, Ekberg K, Kumaran K, Kalhan SC, Wahren J: Estimates of Krebs cycle activity and contributions of gluconeogenesis to hepatic glucose production in fasting healthy subjects and IDDM patients. Diabetologia. 1995 Jul;38(7):831-8. + 7556986 + + + Shirokane Y, Nakajima M, Mizusawa K: A new enzymatic assay of urinary guanidinoacetic acid. Clin Chim Acta. 1991 Oct 31;202(3):227-36. + 1667626 + + + Tannen RL: Ammonia metabolism. Am J Physiol. 1978 Oct;235(4):F265-77. + 29492 + + + Atkin BM, Buist NR, Utter MF, Leiter AB, Banker BQ: Pyruvate carboxylase deficiency and lactic acidosis in a retarded child without Leigh's disease. Pediatr Res. 1979 Feb;13(2):109-16. + 219411 + + + Bojarska-Dahlig H, Gloabski T, Dzioegielewska I: [Salts of cyclic erythromycin A carbonate with cinnamic acid derivatives]. Acta Pol Pharm. 1975;32(3):311-7. + 1155186 + + + Matsumoto T, van der Auwera P, Watanabe Y, Tanaka M, Ogata N, Naito S, Kumazawa J: Neutrophil function in hyperosmotic NaCl is preserved by phosphoenol pyruvate. Urol Res. 1991;19(4):223-7. + 1656579 + + + Nakayama Y, Kinoshita A, Tomita M: Dynamic simulation of red blood cell metabolism and its application to the analysis of a pathological condition. Theor Biol Med Model. 2005 May 9;2:18. + 15882454 + + + Cahill GF Jr, Aoki TT: Renal gluconeogenesis and amino-acid metabolism in man. Med Clin North Am. 1975 May;59(3):751-61. + 1092934 + + + Beyer C: Creatine measurement in serum and urine with an automated enzymatic method. 
Clin Chem. 1993 Aug;39(8):1613-9. + 8353946 + + + Momeni N, Yoshimoto T, Ryberg B, Sandberg-Wollheim M, Grubb A: Factors influencing analysis of prolyl endopeptidase in human blood and cerebrospinal fluid: increase in assay sensitivity. Scand J Clin Lab Invest. 2003;63(6):387-95. + 14594319 + + + Sreekumar A, Poisson LM, Rajendiran TM, Khan AP, Cao Q, Yu J, Laxman B, Mehra R, Lonigro RJ, Li Y, Nyati MK, Ahsan A, Kalyana-Sundaram S, Han B, Cao X, Byun J, Omenn GS, Ghosh D, Pennathur S, Alexander DC, Berger A, Shuster JR, Wei JT, Varambally S, Beecher C, Chinnaiyan AM: Metabolomic profiles delineate potential role for sarcosine in prostate cancer progression. Nature. 2009 Feb 12;457(7231):910-4. doi: 10.1038/nature07762. + 19212411 + + + Elshenawy S, Pinney SE, Stuart T, Doulias PT, Zura G, Parry S, Elovitz MA, Bennett MJ, Bansal A, Strauss JF 3rd, Ischiropoulos H, Simmons RA: The Metabolomic Signature of the Placenta in Spontaneous Preterm Birth. Int J Mol Sci. 2020 Feb 4;21(3). pii: ijms21031043. doi: 10.3390/ijms21031043. 
+ 32033212 + + + + + HMDBP00333 + Sialic acid synthase + Q9NR45 + NANS + Enzyme + + + HMDBP00734 + 6-phosphofructokinase type C + Q01813 + PFKP + Unknown + + + HMDBP00758 + 6-phosphofructokinase, liver type + P17858 + PFKL + Unknown + + + HMDBP00762 + 6-phosphofructokinase, muscle type + P08237 + PFKM + Unknown + + + HMDBP00763 + Pyruvate kinase isozymes M1/M2 + P14618 + PKM + Unknown + + + HMDBP00765 + Pyruvate kinase isozymes R/L + P30613 + PKLR + Unknown + + + HMDBP00890 + Phosphoenolpyruvate carboxykinase [GTP], mitochondrial + Q16822 + PCK2 + Unknown + + + HMDBP00892 + Phosphoenolpyruvate carboxykinase, cytosolic [GTP] + P35558 + PCK1 + Unknown + + + HMDBP01085 + Beta-enolase + P13929 + ENO3 + Unknown + + + HMDBP01086 + Gamma-enolase + P09104 + ENO2 + Unknown + + + HMDBP01087 + Alpha-enolase + P06733 + ENO1 + Unknown + + + HMDBP05558 + Solute carrier organic anion transporter family member 2A1 + Q92959 + SLCO2A1 + Transporter + + + + \ No newline at end of file diff --git a/tests/test_data/HMDB0001245.xml b/tests/test_data/HMDB0001245.xml new file mode 100644 index 0000000..0ea6d74 --- /dev/null +++ b/tests/test_data/HMDB0001245.xml @@ -0,0 +1,735 @@ + + + + 4.0 + 2005-11-16 15:48:42 UTC + 2020-02-26 21:23:06 UTC + HMDB0001245 + expected + + HMDB01245 + + dCDP + dCDP is a substrate for Uridine-cytidine kinase 1, Nucleoside diphosphate kinase (mitochondrial), Nucleoside diphosphate kinase homolog 5, Ribonucleoside-diphosphate reductase large subunit, Nucleoside diphosphate kinase A, Nucleoside diphosphate kinase 7, Ribonucleoside-diphosphate reductase M2 chain, Nucleoside diphosphate kinase B, Nucleoside diphosphate kinase 3, Nucleoside diphosphate kinase 6 and UMP-CMP kinase. 
+ + 2'-Deoxycytidine 5'-diphosphate + 2'-Deoxycytidine diphosphate + D-1beta-Ribofuranosylcytosine diphosphate + Deoxycytidine diphosphate + 2'-Deoxycytidine 5'-diphosphoric acid + 2'-Deoxycytidine diphosphoric acid + D-1b-Ribofuranosylcytosine diphosphate + D-1b-Ribofuranosylcytosine diphosphoric acid + D-1beta-Ribofuranosylcytosine diphosphoric acid + D-1Β-ribofuranosylcytosine diphosphate + D-1Β-ribofuranosylcytosine diphosphoric acid + Deoxycytidine diphosphoric acid + 2'-Deoxy-cytidine 5'-pyrophosphate + 2'-Deoxy-cytidine pyrophosphate + 2'-Deoxycytidine-5'-diphosphate + 4-Amino-1-[2-deoxy-5-O-[hydroxy(phosphonooxy)phosphinyl]-beta-D-erythro-pentofuranosyl]-2(1H)-pyrimidinone + 4-Amino-1-[2-deoxy-5-O-[hydroxy(phosphonooxy)phosphinyl]-beta-delta-erythro-pentofuranosyl]-2(1H)-pyrimidinone + delta-1beta-Ribofuranosylcytosine diphosphate + Deoxy-CDP + Deoxycytidine 5'-diphosphate + + C9H15N3O10P2 + 387.177 + 387.023266739 + [({[(2R,3S,5R)-5-(4-amino-2-oxo-1,2-dihydropyrimidin-1-yl)-3-hydroxyoxolan-2-yl]methoxy}(hydroxy)phosphoryl)oxy]phosphonic acid + dCDP + 800-73-7 + NC1=NC(=O)N(C=C1)[C@H]1C[C@H](O)[C@@H](COP(O)(=O)OP(O)(O)=O)O1 + InChI=1S/C9H15N3O10P2/c10-7-1-2-12(9(14)11-7)8-3-5(13)6(21-8)4-20-24(18,19)22-23(15,16)17/h1-2,5-6,8,13H,3-4H2,(H,18,19)(H2,10,11,14)(H2,15,16,17)/t5-,6+,8+/m0/s1 + FTDHDKPUHBLBTL-SHYZEUOFSA-N + + belongs to the class of organic compounds known as organic pyrophosphates. These are organic compounds containing the pyrophosphate oxoanion, with the structure OP([O-])(=O)OP(O)([O-])=O. 
+ Organic pyrophosphates + Organic compounds + Organic oxygen compounds + Organic oxoanionic compounds + Organic pyrophosphates + Aromatic heteromonocyclic compounds + + Aminopyrimidines and derivatives + Azacyclic compounds + Heteroaromatic compounds + Hydrocarbon derivatives + Hydropyrimidines + Imidolactams + Monoalkyl phosphates + Organic oxides + Organopnictogen compounds + Oxacyclic compounds + Primary amines + Pyrimidones + Secondary alcohols + Tetrahydrofurans + + + Alcohol + Alkyl phosphate + Amine + Aminopyrimidine + Aromatic heteromonocyclic compound + Azacycle + Heteroaromatic compound + Hydrocarbon derivative + Hydropyrimidine + Imidolactam + Monoalkyl phosphate + Organic nitrogen compound + Organic oxide + Organic phosphoric acid derivative + Organic pyrophosphate + Organoheterocyclic compound + Organonitrogen compound + Organooxygen compound + Organopnictogen compound + Oxacycle + Phosphoric acid ester + Primary amine + Pyrimidine + Pyrimidone + Secondary alcohol + Tetrahydrofuran + + + 2'-deoxycytidine phosphate + Deoxyribonucleotides + pyrimidine 2'-deoxyribonucleoside 5'-diphosphate + + + + + Disposition + A concept that describes the origin of a chemical, its location within an organism, or its route of exposure. + + 1 + parent + + + Route of exposure + A mean by which a chemical agent comes in contact with an organism, either under intended or unintended circumstances. + 7724 + 2 + parent + + + + + Enteral + Chemical exposure via the alimentary canal (mouth to anus). + 7743 + 3 + parent + + + + + Ingestion + Chemical exposure facilitated by entry through the mouth. + 7744 + 4 + child + + Digestion + + + + + + + + Source + Natural or synthetic origin of a chemical. + 7724 + 2 + parent + + + + + Endogenous + + 7735 + 3 + child + + + + + Food + + 7735 + 3 + child + + + + + Biological + A living organism (species or a higher taxonomy rank), in which a chemical can be found. 
+ 7735 + 3 + parent + + + + + Plant + A living organism belonging to the kingdom plantea. typically, it grows in a permanent site, absorbs water and inorganic substances through its roots, and synthesizes nutrients in its leaves by photosynthesis using the green pigment chlorophyll. examples incude trees, shrubs, herbs, grasses, ferns, and mosses. + 7736 + 4 + parent + + Flora + + + + Poaceae + + 7738 + 5 + child + + Gramineae + + + + Fabaceae + + 7738 + 5 + child + + Papilionoideae + Legume + + + + Glycine max + + 7738 + 5 + child + + Soy + Soya + Soybean + Soya bean + + + + Cucurbitaceae + + 7738 + 5 + child + + Cucurbits + Gourds + + + + Theobroma cacao + + 7738 + 5 + child + + + + + + + Animal + A living organism belonging to the kingdom animalia. it feeds on organic matter, typically having specialized sense organs and nervous system and able to respond rapidly to stimuli. + 7736 + 4 + child + + Fauna + + + + + + + + Biological location + The physiological origin within an organism, including anatomical compnents, biofluids and excreta. + 7724 + 2 + parent + + + + + Subcellular + An anatomical organizational level including a component within a biological cell . + 7725 + 3 + parent + + + + + Mitochondria + + 7730 + 4 + child + + + + + Nucleus + A body within the cell, surrounded by a membrane, within which lie the chromosomes, one or more nucleoli, combined with proteins, and exhibits mitosis. (NCI) + 7730 + 4 + child + + Cell nucleus + Nucleic + + + + Cytoplasm + The portion of the cell contained within the plasma membrane but excluding the nucleus. + 7730 + 4 + child + + Cytoplasma + + + + + + + + + + Process + Biological or chemical events, or a series thereof, leading to a known function or end-product. + + 1 + parent + + + Naturally occurring process + Naturally-occurring molecular events or a series thereof, leading to a known function or end-product. 
+ 7659 + 2 + parent + + + + + Biological process + Biological or chemical events or a series thereof, leading to a known function or end-product within an organism. + 7660 + 3 + parent + + + + + Biochemical pathway + A linked series of chemical reactions that occur in a defined order within or between organism cells, and lead to a known function or end product. + 7661 + 4 + parent + + + + + MNGIE (Mitochondrial Neurogastrointestinal Encephalopathy) + mngie causes accumulation of thymidine and deoxyuridine in the urine. symptoms of mngie include ptosis, progressive external ophthalmoplegia, gastrointestinal dysmotility (often pseudoobstruction), diffuse leukoencephalopathy, peripheral neuropathy, and myopathy. + 7662 + 5 + child + + + + + Pyrimidine Metabolism + cytosine, thymine, and uracil are pyrimidine derivatives. synthesis of the pyrimidines is less complex than that of the purines, since the base is much simpler this pathway depicts a number of processes including pyrimidine nucleotide biosynthesis, pyrimidine degradation and pyrimidine salvage. 
+ 7662 + 5 + child + + + + + + + + + + + + + Solid + + + + + logp + -1.48 + ALOGPS + + + logs + -1.53 + ALOGPS + + + solubility + 11.3 g/L + ALOGPS + + + logp + -3 + ChemAxon + + + pka_strongest_acidic + 1.78 + ChemAxon + + + pka_strongest_basic + -0.005 + ChemAxon + + + iupac + [({[(2R,3S,5R)-5-(4-amino-2-oxo-1,2-dihydropyrimidin-1-yl)-3-hydroxyoxolan-2-yl]methoxy}(hydroxy)phosphoryl)oxy]phosphonic acid + ChemAxon + + + average_mass + 387.177 + ChemAxon + + + mono_mass + 387.023266739 + ChemAxon + + + smiles + NC1=NC(=O)N(C=C1)[C@H]1C[C@H](O)[C@@H](COP(O)(=O)OP(O)(O)=O)O1 + ChemAxon + + + formula + C9H15N3O10P2 + ChemAxon + + + inchi + InChI=1S/C9H15N3O10P2/c10-7-1-2-12(9(14)11-7)8-3-5(13)6(21-8)4-20-24(18,19)22-23(15,16)17/h1-2,5-6,8,13H,3-4H2,(H,18,19)(H2,10,11,14)(H2,15,16,17)/t5-,6+,8+/m0/s1 + ChemAxon + + + inchikey + FTDHDKPUHBLBTL-SHYZEUOFSA-N + ChemAxon + + + polar_surface_area + 201.44 + ChemAxon + + + refractivity + 74.78 + ChemAxon + + + polarizability + 30.77 + ChemAxon + + + rotatable_bond_count + 6 + ChemAxon + + + acceptor_count + 10 + ChemAxon + + + donor_count + 5 + ChemAxon + + + physiological_charge + -2 + ChemAxon + + + formal_charge + 0 + ChemAxon + + + number_of_rings + 2 + ChemAxon + + + bioavailability + Yes + ChemAxon + + + rule_of_five + Yes + ChemAxon + + + ghose_filter + No + ChemAxon + + + veber_rule + No + ChemAxon + + + mddr_like_rule + No + ChemAxon + + + + + Specdb::CMs + 25964 + + + Specdb::CMs + 37992 + + + Specdb::MsMs + 27746 + + + Specdb::MsMs + 27747 + + + Specdb::MsMs + 27748 + + + Specdb::MsMs + 34304 + + + Specdb::MsMs + 34305 + + + Specdb::MsMs + 34306 + + + Specdb::MsMs + 439044 + + + Specdb::MsMs + 440125 + + + Specdb::MsMs + 447974 + + + Specdb::MsMs + 447975 + + + + + Mitochondria + Nucleus + + + + + + + + Beta Ureidopropionase Deficiency + SMP00172 + + + + Dihydropyrimidinase Deficiency + SMP00178 + + + + MNGIE (Mitochondrial Neurogastrointestinal Encephalopathy) + SMP00202 + + + + Pyrimidine Metabolism + SMP00046 + 
map00240 + + + UMP Synthase Deficiency (Orotic Aciduria) + SMP00219 + + + + + + + + + + + C00705 + FDB022510 + + 150855 + 132961 + + 28846 + DCDP + + + 50858 + Deoxycytidine_diphosphate + 6105 + Nara, Takashi; Misawa, Masanaru. Bacterial phosphorylation of 5'-deoxycytidine monophosphate to di-or triphosphate. Jpn. Tokkyo Koho (1971), 2 pp. + + + Chiu TH, Morimoto H, Baker JJ: Biosynthesis and characterization of phosphatidylglycerophosphoglycerol, a possible intermediate in lipoteichoic acid biosynthesis in Streptococcus sanguis. Biochim Biophys Acta. 1993 Feb 24;1166(2-3):222-8. + 8443240 + + + + + HMDBP00003 + UMP-CMP kinase + P30085 + CMPK1 + Unknown + + + HMDBP00105 + Uridine-cytidine kinase 1 + Q9HA47 + UCK1 + Unknown + + + HMDBP00106 + Nucleoside diphosphate kinase, mitochondrial + O00746 + NME4 + Unknown + + + HMDBP00108 + Ribonucleoside-diphosphate reductase large subunit + P23921 + RRM1 + Enzyme + + + HMDBP00109 + Nucleoside diphosphate kinase A + P15531 + NME1 + Unknown + + + HMDBP00110 + Nucleoside diphosphate kinase 7 + Q9Y5B8 + NME7 + Unknown + + + HMDBP00111 + Ribonucleoside-diphosphate reductase subunit M2 + P31350 + RRM2 + Unknown + + + HMDBP00112 + Nucleoside diphosphate kinase B + P22392 + NME2 + Unknown + + + HMDBP00113 + Nucleoside diphosphate kinase 3 + Q13232 + NME3 + Unknown + + + HMDBP00114 + Nucleoside diphosphate kinase 6 + O75414 + NME6 + Unknown + + + HMDBP02576 + Uridine-cytidine kinase 2 + Q9BZX2 + UCK2 + Unknown + + + HMDBP07407 + Ribonucleoside-diphosphate reductase subunit M2 B + Q7LG56 + RRM2B + Unknown + + + HMDBP07409 + UMP-CMP kinase 2, mitochondrial + Q5EBM0 + CMPK2 + Enzyme + + + HMDBP07410 + Uridine-cytidine kinase-like 1 + Q9NWZ5 + UCKL1 + Enzyme + + + HMDBP11852 + Nucleoside diphosphate kinase homolog 5 + P56597 + NME5 + Unknown + + + + \ No newline at end of file diff --git a/tests/test_data/metaboblend_queries.csv b/tests/test_data/metaboblend_queries.csv index 25e3afa..77de88e 100644 --- 
a/tests/test_data/metaboblend_queries.csv +++ b/tests/test_data/metaboblend_queries.csv @@ -1,4 +1,4 @@ -ms_id,exact_mass,C,H,N,O,P,S,ppm,ha_min,ha_max,max_atoms_available,max_degree,max_n_substructures,hydrogenation_allowance,isomeric_smiles +ms_id_num,ms_id,exact_mass,C,H,N,O,P,S,ppm,ha_min,ha_max,max_atoms_available,max_degree,max_n_substructures,hydrogenation_allowance,isomeric_smiles 0,HMDB0000073,153.078979,8,11,1,2,0,0,5,,,2,6,3,2,1 1,HMDB0000122,180.06339,6,12,0,6,0,0,5,,,2,6,3,2,1 2,HMDB0000158,181.073894,9,11,1,3,0,0,5,,,2,6,3,2,1 diff --git a/tests/test_data/metaboblend_structures.csv b/tests/test_data/metaboblend_structures.csv index 73cbf77..b3349b5 100644 --- a/tests/test_data/metaboblend_structures.csv +++ b/tests/test_data/metaboblend_structures.csv @@ -1,47 +1,47 @@ -ms_id,smiles,frequency,exact_mass,C,H,N,O,P,S -0,NCCc1cc(O)cc(O)c1,1 -0,NCCc1cc(O)ccc1O,1 -0,NCCc1ccc(O)c(O)c1,3 -1,OC1C(O)[C@H](O)[C@@H](O)[C@H](O)[C@H]1O,1 -1,OC1[C@H](O)C(O)[C@H](O)[C@@H](O)[C@@H]1O,1 -1,OC1[C@H](O)[C@@H](O)[C@@H](O)[C@H](O)[C@H]1O,1 -1,OC1[C@H](O)[C@@H](O)[C@H](O)[C@@H](O)[C@@H]1O,1 -1,OC1[C@H](O)[C@H](O)[C@@H](O)[C@H](O)[C@H]1O,1 -1,OC1[C@H](O)[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O,1 -1,OC[C@H]1OC(O)C(O)[C@H](O)[C@@H]1O,1 -1,OC[C@H]1OC(O)O[C@H](CO)C1O,1 -1,OC[C@H]1OC(O)[C@@H](O)C(O)[C@@H]1O,1 -1,OC[C@H]1OC(O)[C@@H](O)[C@@H](O)[C@@H]1O,1 -1,OC[C@H]1OC(O)[C@@H](O)[C@H](O)[C@@H]1O,1 -1,OC[C@H]1OC(O)[C@H](O)C(O)[C@@H]1O,1 -1,OC[C@H]1OC(O)[C@H](O)O[C@@H]1CO,1 -1,OC[C@H]1OC(O)[C@H](O)[C@@H](CO)O1,1 -1,OC[C@H]1OC(O)[C@H](O)[C@@H](O)C1O,1 -1,OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O,1 -1,OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@H]1O,1 -1,OC[C@H]1OC(O)[C@H](O)[C@H](O)[C@@H]1O,1 -1,OC[C@H]1OC(O)[C@H](O)[C@H](O)[C@H]1O,1 -1,OC[C@H]1OO[C@H](CO)[C@@H](O)[C@@H]1O,1 -1,OC[C@H]1OO[C@H](CO)[C@H](O)[C@@H]1O,1 -1,OC[C@H]1O[C@@H](O)C(O)[C@@H](O)[C@@H]1O,1 -1,OC[C@H]1O[C@@H](O)C(O)[C@@H](O)[C@H]1O,1 -1,OC[C@H]1O[C@@H](O)[C@@H](O)[C@@H](CO)O1,1 -1,OC[C@H]1O[C@@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,1 
-1,OC[C@H]1O[C@@H](O)[C@@H](O)[C@@H](O)[C@H]1O,1 -1,OC[C@H]1O[C@@H](O)[C@H](O)[C@@H](CO)O1,1 -1,OC[C@H]1O[C@@H](O)[C@H](O)[C@@H](O)C1O,1 -1,OC[C@H]1O[C@@H](O)[C@H](O)[C@@H](O)[C@@H]1O,1 -1,OC[C@H]1O[C@@H](O)[C@H](O)[C@@H](O)[C@H]1O,1 -1,OC[C@H]1O[C@H](O)[C@@H](CO)OC1O,1 -1,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,1 -1,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@H]1O,1 -1,OC[C@H]1O[C@H](O)[C@H](O)C(O)[C@@H]1O,1 -1,OC[C@H]1O[C@H](O)[C@H](O)O[C@@H]1CO,1 -1,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](CO)O1,1 -1,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)C1O,1 -1,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O,1 -1,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@H]1O,1 -1,OC[C@H]1O[C@H](O)[C@H](O)[C@H](O)[C@@H]1O,1 -1,OC[C@H]1O[C@H](O)[C@H](O)[C@H](O)[C@H]1O,1 -2,N[C@@H](Cc1ccc(O)cc1)C(=O)O,1 -2,N[C@@H](Cc1cccc(O)c1)C(=O)O,1 +ms_id,smiles,frequency,structure_score +0,1,3,0.378033008588991 +0,2,1,0.09267766952966369 +0,3,1,0.09267766952966369 +1,10,1,0.15558761401702176 +1,11,1,0.16093508811316923 +1,12,1,0.15558761401702176 +1,13,1,0.16093508811316923 +1,14,1,0.16093508811316923 +1,15,1,0.16093508811316923 +1,16,1,0.15558761401702176 +1,17,1,0.15558761401702176 +1,18,1,0.15558761401702176 +1,19,1,0.15558761401702176 +1,20,1,0.15558761401702176 +1,21,1,0.14267766952966368 +1,22,1,0.19870345724290236 +1,23,1,0.14267766952966368 +1,24,1,0.19870345724290236 +1,25,1,0.15558761401702176 +1,26,1,0.15558761401702176 +1,27,1,0.16093508811316923 +1,28,1,0.16093508811316923 +1,29,1,0.15558761401702176 +1,30,1,0.15558761401702176 +1,31,1,0.16093508811316923 +1,32,1,0.16093508811316923 +1,33,1,0.16093508811316923 +1,34,1,0.15558761401702176 +1,35,1,0.15558761401702176 +1,36,1,0.16093508811316923 +1,37,1,0.16093508811316923 +1,38,1,0.16093508811316923 +1,39,1,0.16093508811316923 +1,4,1,0.16093508811316923 +1,40,1,0.16093508811316923 +1,41,1,0.15558761401702176 +1,42,1,0.16093508811316923 +1,43,1,0.15558761401702176 +1,44,1,0.16093508811316923 +1,5,1,0.16093508811316923 +1,6,1,0.16093508811316923 +1,7,1,0.16093508811316923 
+1,8,1,0.16093508811316923 +1,9,1,0.16093508811316923 +2,45,1,0.14267766952966368 +2,46,1,0.14267766952966368 diff --git a/tests/test_data/substructures.sqlite b/tests/test_data/substructures.sqlite index 612d839..1ef7835 100644 Binary files a/tests/test_data/substructures.sqlite and b/tests/test_data/substructures.sqlite differ diff --git a/tests/test_data/test_hmdbs.dictionary b/tests/test_data/test_hmdbs.dictionary index 31a3e76..f164d71 100644 Binary files a/tests/test_data/test_hmdbs.dictionary and b/tests/test_data/test_hmdbs.dictionary differ diff --git a/tests/test_parse.py b/tests/test_parse.py index 6e3c05d..19cc4ce 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright © 2019-2020 Ralf Weber +# Copyright © 2019-2020 Jack Gisby, Ralf Weber # # This file is part of MetaboBlend. # @@ -25,6 +25,7 @@ import shutil import tempfile import unittest + from metaboblend.parse import * @@ -62,7 +63,7 @@ def test_parse_msp(self): self.assertNotEqual(ms, None) self.assertEqual(ms, {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, - "fragment_mzs": self.fragment_mzs, "precursor_type": "[M+H]+", + "fragment_mzs": self.fragment_mzs, "precursor_type": "[M+H]+", 'ion_mode': '+', "exact_mass": self.exact_mass, "neutral_fragment_masses": self.neutral_fragment_masses}) self.assertEqual(list(parse_msp(self.to_test_data("massbank_msp.txt")))[0], None) @@ -98,6 +99,7 @@ def test_parse_ms_data(self): parsed_neutral_fragment_masses_ms_dict = list(parse_ms_data({"AU101101": copy.deepcopy(neutral_fragment_masses_ms_dict)}))[0] neutral_fragment_masses_ms_dict["exact_mass"] = self.exact_mass + neutral_fragment_masses_ms_dict['ion_mode'] = "+" self.assertEqual(parsed_neutral_fragment_masses_ms_dict, neutral_fragment_masses_ms_dict) uncalculated_ms_dict = {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, @@ -105,6 +107,7 @@ def test_parse_ms_data(self): 
parsed_uncalculated_ms_dict = list(parse_ms_data({"AU101101": copy.deepcopy(uncalculated_ms_dict)}))[0] uncalculated_ms_dict["exact_mass"] = self.exact_mass uncalculated_ms_dict["neutral_fragment_masses"] = self.neutral_fragment_masses + uncalculated_ms_dict['ion_mode'] = "+" self.assertEqual(parsed_uncalculated_ms_dict, uncalculated_ms_dict) # test with msn=False @@ -112,6 +115,7 @@ def test_parse_ms_data(self): "prescribed_mass": "m", "precursor_type": "[M+H]+"} parsed_generate_structures_dict = list(parse_ms_data({"AU101101": copy.deepcopy(generate_structures_dict)}, False))[0] generate_structures_dict["exact_mass"] = self.exact_mass + generate_structures_dict['ion_mode'] = "+" self.assertEqual(parsed_generate_structures_dict, generate_structures_dict) # test with exact mass provided @@ -153,7 +157,7 @@ def test_reformat_msp_input(self): formatted_msp_dict = {'ms_id': 'AU101101', 'mf': self.mf, 'precursor_mz': self.precursor_mz, 'fragment_mzs': self.fragment_mzs, 'precursor_type': '[M+H]+', - 'exact_mass': self.exact_mass, + 'exact_mass': self.exact_mass, 'ion_mode': '+', 'neutral_fragment_masses': self.neutral_fragment_masses} self.assertEqual(reformat_msp_input(unformatted_msp_dict), formatted_msp_dict) diff --git a/tests/test_substructure_database.py b/tests/test_substructure_database.py deleted file mode 100644 index 5c89a81..0000000 --- a/tests/test_substructure_database.py +++ /dev/null @@ -1,262 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright © 2019-2020 Ralf Weber -# -# This file is part of MetaboBlend. -# -# MetaboBlend is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. 
-# -# MetaboBlend is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with MetaboBlend. If not, see . -# - - -import os -import unittest -import tempfile -import shutil -from metaboblend.databases import * - - -class SubstructureDbTestCase(unittest.TestCase): - temp_results_dir = None - - @classmethod - def to_test_results(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) - - @classmethod - def to_test_data(cls, *args): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) - - @classmethod - def setUpClass(cls): - cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) - - shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"), - cls.to_test_results("test_data")) - - def test_init(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite"), - self.to_test_data("connectivity.sqlite")) - - db.cursor.execute("SELECT * FROM substructures") - first_row = db.cursor.fetchone()[0:18] - self.assertEqual(first_row, (1, '*:c(:*)CCN', 4, 10, 56, 56.05, 56.05002399999998, 3, 6, 1, 0, 0, 0, 2, - '{3: 2}', 1, '{3: [1.5, 1.5]}', '[4, 5]')) - - self.assertTrue(Chem.MolFromSmiles(first_row[1], False)) - self.assertEqual(len(db.cursor.fetchall()), 1235) - - db.cursor.execute("SELECT * FROM hmdbid_substructures") - first_row = db.cursor.fetchone() - self.assertEqual(first_row, ('HMDB0000073', 1)) - self.assertEqual(len(db.cursor.fetchall()), 1292) - - db.cursor.execute("SELECT * FROM compounds") - first_row = db.cursor.fetchone() - self.assertEqual(first_row, ('HMDB0000073', 153.078979, 'C8H11NO2', 8, 11, 1, 2, 0, 0, 
'NCCC1=CC(O)=C(O)C=C1')) - self.assertEqual(len(db.cursor.fetchall()), 3) - - db.cursor.execute("SELECT * FROM graphs.subgraphs") - first_row = db.cursor.fetchone() - self.assertEqual(first_row[0:9], (1, 1, b'A_', 2, '(1, 1)', '(1, 1)', '((1,), (1,))', 2, 1)) - self.assertEqual(len(db.cursor.fetchall()), 107) - - db.close() - - def test_select_compounds(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - for i, cpd_entry in enumerate(db.select_compounds(["HMDB0000158", "HMDB0000122"])): - self.assertLessEqual(i, 2) - self.assertTrue(cpd_entry[0] == "HMDB0000158" or cpd_entry[0] == "HMDB0000122") - - db.close() - - def test_filter_hmdbid_substructures(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - db.filter_hmdbid_substructures(2) - - db.cursor.execute("SELECT COUNT(*) FROM filtered_hmdbid_substructures GROUP BY hmdbid") - for i, hmdbid_count in enumerate(db.cursor.fetchall()): - self.assertGreater(hmdbid_count[0], 1) - - self.assertEqual(i, 3) - - db.close() - - def test_generate_substructure_network(self): # also tests get_substructure_network, get_single_edge and close - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - - self.assertEqual(db.get_single_edge([3, 4, 2]), {3: {3: None, 4: 2}, 2: {3: 1, 4: 1, 2: None}, 4: {4: None}}) - - std = db.generate_substructure_network(min_node_weight=2, return_networkx=True) - - db.cursor.execute("SELECT * FROM filtered_hmdbid_substructures") - for hmdb in db.cursor.fetchall(): - - self.assertTrue(hmdb[1] in std.nodes) - - db.cursor.execute("SELECT DISTINCT substructure_id FROM filtered_hmdbid_substructures") - self.assertEqual(len(db.cursor.fetchall()), 57) - self.assertEqual(std.number_of_nodes(), 57) - - self.assertEqual(std.number_of_edges(), 1024) - - edge_count = [] - db.cursor.execute("SELECT * FROM substructure_graph") - for edge in db.cursor.fetchall(): - edge_count.append(std.get_edge_data(edge[0], edge[1])["weight"]) - - 
self.assertEqual(sum(edge_count), 2048) - - db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") - self.assertEqual(len(db.cursor.fetchall()), 5) - - db.cursor.execute("CREATE TABLE subset_substructures AS SELECT * FROM COMPOUNDS") - db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") - self.assertEqual(len(db.cursor.fetchall()), 6) - - db.close() - - self.assertRaises(sqlite3.ProgrammingError, lambda: db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")) - - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") - self.assertEqual(len(db.cursor.fetchall()), 4) - - db.close() - - def test_select_mass_values(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - ests = db.select_mass_values("1", [], "substructures") - exacts = db.select_mass_values("0_0001", [], "substructures") - - self.assertEqual(len(ests), 71) - self.assertEqual(len(exacts), 117) - - for exact in exacts: - self.assertTrue(round(exact) in ests) - - self.assertEqual(db.select_mass_values("0_0001", [50, 64, 73], "substructures"), - [[50.0156], [64.0313], [73.029]]) - self.assertEqual(db.select_mass_values("0_0001", [120, 87, 87], "substructures"), - [[120.0423], [87.0082, 87.0446], [87.0082, 87.0446]]) - self.assertEqual(db.select_mass_values("0_0001", [50, 64, 73], "substructures"), - [[50.0156], [64.0313], [73.029]]) - self.assertEqual(db.select_mass_values("0_0001", [55, 80, 107], "substructures"), - [[55.0184, 55.0422], [80.0262, 80.05], [107.0497, 107.0735]]) - self.assertEqual(db.select_mass_values("0_0001", [63, 63, 63], "substructures"), - [[63.0235], [63.0235], [63.0235]]) - - self.assertRaises(sqlite3.OperationalError, - lambda: db.select_mass_values("0_0001", [63, 63, 63], "substrusctures")) - db.close() - - def test_select_mfs(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - 
self.assertEqual(db.select_mfs(107.0735, "substructures", "0_0001"), [(7, 9, 1, 0, 0, 0)]) - self.assertEqual(db.select_mfs(107.0735, "substructures", "1"), []) - self.assertEqual(db.select_mfs(107, "substructures", "0_0001"), []) - self.assertEqual(db.select_mfs(107.0735, "substructures", "1"), []) - self.assertEqual(db.select_mfs(107, "substructures", "1"), - [(7, 9, 1, 0, 0, 0), (7, 7, 0, 1, 0, 0)]) - - self.assertRaises(sqlite3.OperationalError, - lambda: db.select_mfs(107.0735, "substrusctures", "0_0001")) - - db.close() - - def test_k_configs(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite"), - self.to_test_data("connectivity.sqlite")) - - k_configs = db.k_configs() - self.assertEqual(len(k_configs), 67) - self.assertEqual(k_configs['((1,), (1,))'], [((0, 1),)]) - self.assertEqual(k_configs['((2, 2), (2, 2), (2, 2))'], - [((0, 2), (0, 4), (1, 3), (1, 5), (2, 4), (3, 5)), - ((0, 2), (0, 5), (1, 3), (1, 4), (2, 5), (3, 4)), - ((0, 3), (0, 5), (1, 2), (1, 4), (2, 4), (3, 5)), - ((0, 3), (0, 4), (1, 2), (1, 5), (2, 5), (3, 4))]) - - db.close() - - def test_select_substructures(self): - db = SubstructureDb(self.to_test_data("substructures.sqlite")) - self.assertEqual(db.select_substructures([[2, 5, 0, 0, 0, 0]], "substructures"), []) - self.assertEqual(len(db.select_substructures([[4, 4, 0, 0, 0, 0]], "substructures")[0]), 7) - self.assertEqual(list(db.select_substructures([[4, 4, 0, 0, 0, 0]], "substructures")[0][0].keys()), - ['smiles', 'mol', 'bond_types', 'degree_atoms', 'valence', 'atoms_available', 'dummies']) - - substructures = list(db.select_substructures([[4, 4, 0, 0, 0, 0]], "substructures")[0][0].values()) - self.assertEqual([item for i, item in enumerate(substructures) if i != 1], - ['*Cc(:*)cc:*', - {1: [1.0], 2: [1.5], 5: [1.5]}, - {1: 1, 2: 1, 5: 1}, - 3, - 3, - [0, 3, 4]]) - - self.assertEqual(len(db.select_substructures([[7, 7, 0, 0, 0, 0]], "substructures")[0]), 3) - self.assertEqual(list(db.select_substructures([[7, 7, 0, 
0, 0, 0]], "substructures")[0][0].keys()), - ['smiles', 'mol', 'bond_types', 'degree_atoms', 'valence', 'atoms_available', 'dummies']) - substructures = list(db.select_substructures([[7, 7, 0, 0, 0, 0]], "substructures")[0][0].values()) - self.assertEqual([item for i, item in enumerate(substructures) if i != 1], - ['*CCc1c:*:c(*)cc1', - {1: [1.0], 4: [1.5], 6: [1.5, 1.0]}, - {1: 1, 4: 1, 6: 2}, - 4, - 3, - [0, 5, 7]]) - - self.assertRaises(sqlite3.OperationalError, - lambda: db.select_substructures([[2, 5, 0, 0, 0, 0]], "substrusctures")) - db.close() - - def test_create_compound_database(self): # also tests create_indexes - db = SubstructureDb(self.to_test_results("substructures_new.sqlite")) - db.create_compound_database() - db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") - self.assertEqual(len(db.cursor.fetchall()), 3) - - db.create_indexes() - db.close() - - shutil.copyfile(self.to_test_data("substructures.sqlite"), self.to_test_results("substructures_copy.sqlite")) - db = SubstructureDb(self.to_test_results("substructures_copy.sqlite"), - self.to_test_data("connectivity.sqlite")) - db.create_indexes() - db.create_compound_database() - db.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") - self.assertEqual(len(db.cursor.fetchall()), 3) - - db.cursor.execute("SELECT * FROM substructures") - self.assertEqual(len(db.cursor.fetchall()), 0) - - db.cursor.execute("SELECT * FROM hmdbid_substructures") - self.assertEqual(len(db.cursor.fetchall()), 0) - - db.cursor.execute("SELECT * FROM compounds") - self.assertEqual(len(db.cursor.fetchall()), 0) - - db.cursor.execute("SELECT * FROM graphs.subgraphs") - first_row = db.cursor.fetchone() - self.assertEqual(first_row[0:9], (1, 1, b'A_', 2, '(1, 1)', '(1, 1)', '((1,), (1,))', 2, 1)) - self.assertEqual(len(db.cursor.fetchall()), 107) - - db.create_indexes() - db.close() - - -if __name__ == '__main__': - unittest.main()