|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "id": "70e60490", |
| 6 | + "metadata": {}, |
| 7 | + "source": [ |
| 8 | + "# Tutorial for running MS2Query and creating your own libraries" |
| 9 | + ] |
| 10 | + }, |
| 11 | + { |
| 12 | + "cell_type": "markdown", |
| 13 | + "id": "4d564033", |
| 14 | + "metadata": {}, |
| 15 | + "source": [ |
| 16 | + "# Optional: download a matchms-cleaned library\n", |
| 17 | + "The code below downloads a library that has already been cleaned with matchms, together with an MS2DeepScore model. You can also use your own library, but make sure you know what you are doing and clean the library first. If you only have a few reference spectra, it is probably best to combine them with the reference spectra downloaded below to make sure the MS2Query search works properly." |
| 18 | + ] |
| 19 | + }, |
| 20 | + { |
| 21 | + "cell_type": "code", |
| 22 | + "execution_count": null, |
| 23 | + "id": "b7df418e", |
| 24 | + "metadata": {}, |
| 25 | + "outputs": [ |
| 26 | + { |
| 27 | + "name": "stdout", |
| 28 | + "output_type": "stream", |
| 29 | + "text": [ |
| 30 | + "The file ./zenodo_files\\data_split_inchikeys.json already exists, the file won't be downloaded\n", |
| 31 | + "The file ./zenodo_files\\merged_and_cleaned_libraries_1.mgf already exists, the file won't be downloaded\n", |
| 32 | + "The file ./zenodo_files\\ms2deepscore_model.pt already exists, the file won't be downloaded\n" |
| 33 | + ] |
| 34 | + } |
| 35 | + ], |
| 36 | + "source": [ |
| 37 | + "import requests\n", |
| 38 | + "import os\n", |
| 39 | + "from tqdm import tqdm\n", |
| 40 | + "\n", |
| 41 | + "def download_file(link, file_name):\n", |
| 42 | + " response = requests.get(link, stream=True)\n", |
| 43 | + " if os.path.exists(file_name):\n", |
| 44 | + " print(f\"The file {file_name} already exists, the file won't be downloaded\")\n", |
| 45 | + " return\n", |
| 46 | + " total_size = int(response.headers.get('content-length', 0))\n", |
| 47 | + "\n", |
| 48 | + " with open(file_name, \"wb\") as f, tqdm(desc=\"Downloading file\", total=total_size, unit='B', unit_scale=True, unit_divisor=1024,) as bar:\n", |
| 49 | + " for chunk in response.iter_content(chunk_size=1024):\n", |
| 50 | + " if chunk:\n", |
| 51 | + " f.write(chunk)\n", |
| 52 | + " bar.update(len(chunk)) # Update progress bar by the chunk size\n", |
| 53 | + "folder_to_store_zenodo_files = \"./zenodo_files\"\n", |
| 54 | + "os.makedirs(folder_to_store_zenodo_files, exist_ok=True)\n", |
| 55 | + "\n", |
| 56 | + "download_file(\"https://zenodo.org/records/16882111/files/merged_and_cleaned_libraries_1.mgf?download=1\", \n", |
| 57 | + " os.path.join(folder_to_store_zenodo_files, \"merged_and_cleaned_libraries_1.mgf\"))\n", |
| 58 | + "download_file(\"https://zenodo.org/records/17826815/files/ms2deepscore_model.pt?download=1\", \n", |
| 59 | + " os.path.join(folder_to_store_zenodo_files, \"ms2deepscore_model.pt\"))" |
| 60 | + ] |
| 61 | + }, |
| 62 | + { |
| 63 | + "cell_type": "markdown", |
| 64 | + "id": "fa99d2dd", |
| 65 | + "metadata": {}, |
| 66 | + "source": [ |
| 67 | + "# Specify file locations\n", |
| 68 | + "Replace these with your own file names." |
| 69 | + ] |
| 70 | + }, |
| 71 | + { |
| 72 | + "cell_type": "code", |
| 73 | + "execution_count": null, |
| 74 | + "id": "6db579a9", |
| 75 | + "metadata": {}, |
| 76 | + "outputs": [], |
| 77 | + "source": [ |
| 78 | + "library_spectra_file = os.path.join(folder_to_store_zenodo_files, \"merged_and_cleaned_libraries_1.mgf\")\n", |
| 79 | + "ms2deepscore_model_file_name = os.path.join(folder_to_store_zenodo_files, \"ms2deepscore_model.pt\")\n", |
| 80 | + "query_spectrum_file = \"replace_with_your_lib_spectra.mgf\"" |
| 81 | + ] |
| 82 | + }, |
| 83 | + { |
| 84 | + "cell_type": "code", |
| 85 | + "execution_count": null, |
| 86 | + "id": "f521c1c4", |
| 87 | + "metadata": {}, |
| 88 | + "outputs": [ |
| 89 | + { |
| 90 | + "name": "stderr", |
| 91 | + "output_type": "stream", |
| 92 | + "text": [ |
| 93 | + "1017531it [09:51, 1720.67it/s]\n" |
| 94 | + ] |
| 95 | + } |
| 96 | + ], |
| 97 | + "source": [ |
| 98 | + "from matchms.importing import load_from_mgf\n", |
| 99 | + "from tqdm import tqdm\n", |
| 100 | + "\n", |
| 101 | + "library_spectra = list(tqdm(load_from_mgf(library_spectra_file)))\n", |
| 102 | + "query_spectra = list(tqdm(load_from_mgf(query_spectrum_file)))" |
| 103 | + ] |
| 104 | + }, |
| 105 | + { |
| 106 | + "cell_type": "markdown", |
| 107 | + "id": "64534dd6", |
| 108 | + "metadata": {}, |
| 109 | + "source": [ |
| 110 | + "# Create the reference library files\n", |
| 111 | + "The code below will precompute everything needed to run MS2Query. It will save this in the same folder as your ms2deepscore model. \n", |
| 112 | + "The files created are \"embeddings.npz\", \"top_k_tanimoto_scores.parquet\", \"library_metadata.parquet\". " |
| 113 | + ] |
| 114 | + }, |
| 115 | + { |
| 116 | + "cell_type": "code", |
| 117 | + "execution_count": null, |
| 118 | + "id": "669a489a", |
| 119 | + "metadata": {}, |
| 120 | + "outputs": [], |
| 121 | + "source": [ |
| 122 | + "from ms2query.ms2query_development.ReferenceLibrary import ReferenceLibrary\n", |
| 123 | + "reference_library = ReferenceLibrary.create_from_spectra(library_spectra, ms2deepscore_model_file_name)" |
| 124 | + ] |
| 125 | + }, |
| 126 | + { |
| 127 | + "cell_type": "markdown", |
| 128 | + "id": "dfd0a633", |
| 129 | + "metadata": {}, |
| 130 | + "source": [ |
| 131 | + "# Run MS2Query\n", |
| 132 | + "The code above only has to be run once; after that, you can load the library faster from the saved files." |
| 133 | + ] |
| 134 | + }, |
| 135 | + { |
| 136 | + "cell_type": "code", |
| 137 | + "execution_count": null, |
| 138 | + "id": "e6cf44e2", |
| 139 | + "metadata": {}, |
| 140 | + "outputs": [], |
| 141 | + "source": [ |
| 142 | + "# No need to run this if you just created the library above\n", |
| 143 | + "reference_library = ReferenceLibrary.load_from_directory(folder_to_store_zenodo_files)" |
| 144 | + ] |
| 145 | + }, |
| 146 | + { |
| 147 | + "cell_type": "code", |
| 148 | + "execution_count": null, |
| 149 | + "id": "566b2b0d", |
| 150 | + "metadata": {}, |
| 151 | + "outputs": [], |
| 152 | + "source": [ |
| 153 | + "results = reference_library.run_ms2query(query_spectra)" |
| 154 | + ] |
| 155 | + }, |
| 156 | + { |
| 157 | + "cell_type": "code", |
| 158 | + "execution_count": null, "id": "98ce9660", |
| 159 | + "metadata": {}, "outputs": [], |
| 160 | + "source": [ |
| 161 | + "print(results)" |
| 162 | + ] |
| 163 | + }, |
| 164 | + { |
| 165 | + "cell_type": "code", |
| 166 | + "execution_count": null, |
| 167 | + "id": "b7e3b533", |
| 168 | + "metadata": {}, |
| 169 | + "outputs": [], |
| 170 | + "source": [ |
| 171 | + "results.to_csv(\"ms2query_results.csv\")" |
| 172 | + ] |
| 173 | + } |
| 174 | + ], |
| 175 | + "metadata": { |
| 176 | + "kernelspec": { |
| 177 | + "display_name": "ms2query_2", |
| 178 | + "language": "python", |
| 179 | + "name": "python3" |
| 180 | + }, |
| 181 | + "language_info": { |
| 182 | + "codemirror_mode": { |
| 183 | + "name": "ipython", |
| 184 | + "version": 3 |
| 185 | + }, |
| 186 | + "file_extension": ".py", |
| 187 | + "mimetype": "text/x-python", |
| 188 | + "name": "python", |
| 189 | + "nbconvert_exporter": "python", |
| 190 | + "pygments_lexer": "ipython3", |
| 191 | + "version": "3.11.14" |
| 192 | + } |
| 193 | + }, |
| 194 | + "nbformat": 4, |
| 195 | + "nbformat_minor": 5 |
| 196 | +} |
0 commit comments