Skip to content

Commit 711bd02

Browse files
committed
Make create_ms2query_library method
1 parent fb0107d commit 711bd02

File tree

1 file changed

+41
-0
lines changed

1 file changed

+41
-0
lines changed

ms2query/run_ms2query.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,47 @@ def get_ms2query_reliability_prediction(
7373
return ms2query_scores
7474

7575

76+
def create_ms2query_library(library_spectra_file: str, ms2deepscore_model_file_name: str):
    """Build the files of an MS2Query library next to the library spectra file.

    The library spectra are loaded, embedded with the given MS2DeepScore model,
    and three files are written to the directory containing
    ``library_spectra_file``:

    * ``embeddings.npz`` - the embeddings of the library spectra,
    * ``top_k_tanimoto_scores.parquet`` - the top-8 Tanimoto scores of the
      library fingerprints ("daylight", 4096) against themselves,
    * ``library_metadata.parquet`` - selected metadata fields per spectrum.

    Args:
        library_spectra_file: Path to the file with library spectra. Its parent
            directory is where all output files are stored.
        ms2deepscore_model_file_name: Path to the MS2DeepScore model file that
            is passed to ``load_model``.

    Raises:
        FileExistsError: If any of the output files already exists in the
            directory of the library spectra. This is checked before any
            spectra are loaded, so nothing is computed or overwritten.
    """
    # Outputs live next to the library spectra file supplied by the caller.
    spectrum_file_directory = Path(library_spectra_file).parent
    embedding_file_location = spectrum_file_directory / "embeddings.npz"
    top_k_tanimoto_score_file_location = spectrum_file_directory / "top_k_tanimoto_scores.parquet"
    reference_metadata_file = spectrum_file_directory / "library_metadata.parquet"

    # Fail fast, before the expensive loading/embedding work, rather than
    # overwrite the result of an earlier run.
    for output_file in (
        embedding_file_location,
        top_k_tanimoto_score_file_location,
        reference_metadata_file,
    ):
        if output_file.exists():
            raise FileExistsError(
                f"There is already a {output_file.name} file in the directory of your library spectra"
            )

    library_spectra = list(tqdm(load_spectra(library_spectra_file), "Loading library spectra"))
    library_spectra = AnnotatedSpectrumSet.create_spectrum_set(library_spectra)
    ms2deepscore_model = load_model(ms2deepscore_model_file_name)
    library_spectra.add_embeddings(ms2deepscore_model)

    # NOTE(review): this reaches into a private attribute of the spectrum set;
    # switch to a public accessor if AnnotatedSpectrumSet offers one.
    library_spectra._embeddings.save(embedding_file_location)

    # The library is compared against itself, hence the same fingerprints twice.
    fingerprints = Fingerprints.from_spectrum_set(library_spectra, "daylight", 4096)
    top_k_tanimoto_scores = TopKTanimotoScores.calculate_from_fingerprints(
        fingerprints,
        fingerprints,
        k=8,
    )
    top_k_tanimoto_scores.save(top_k_tanimoto_score_file_location)

    reference_metadata = extract_metadata_from_library(
        library_spectra,
        [
            "precursor_mz",
            "retention_time",
            "collision_energy",
            "compound_name",
            "smiles",
            "inchikey",
        ],
    )
    reference_metadata.to_parquet(reference_metadata_file)
115+
116+
76117
def extract_metadata_from_library(spectra: AnnotatedSpectrumSet, metadata_to_collect: list):
77118
collected_metadata = {key: [] for key in metadata_to_collect}
78119
collected_metadata["spectrum_hashes"] = []

0 commit comments

Comments
 (0)