@@ -73,6 +73,47 @@ def get_ms2query_reliability_prediction(
7373 return ms2query_scores
7474
7575
def create_ms2query_library(library_spectra_file: str, ms2deepscore_model_file_name: str) -> None:
    """Load a spectral library and save its embeddings, top-k Tanimoto scores and metadata.

    All output files are written to the directory that contains
    ``library_spectra_file``:

    * ``embeddings.npz`` - embeddings added to the library spectra using the
      loaded MS2Deepscore model.
    * ``top_k_tanimoto_scores.parquet`` - top-k (k=8) Tanimoto scores of the
      library fingerprints compared against themselves.
    * ``library_metadata.parquet`` - metadata collected from the library spectra
      by ``extract_metadata_from_library``.

    Args:
        library_spectra_file: Path to the file holding the library spectra.
        ms2deepscore_model_file_name: Path to the MS2Deepscore model file that
            is passed to ``load_model``.

    Raises:
        FileExistsError: If ``embeddings.npz`` or
            ``top_k_tanimoto_scores.parquet`` already exists next to the
            library spectra file.
    """
    # The output directory must follow the library file that was passed in;
    # a fixed path would make the overwrite guards below check the wrong place.
    spectrum_file_directory = Path(library_spectra_file).parent
    embedding_file_location = spectrum_file_directory / "embeddings.npz"
    top_k_tanimoto_score_file_location = spectrum_file_directory / "top_k_tanimoto_scores.parquet"
    reference_metadata_file = spectrum_file_directory / "library_metadata.parquet"

    # Fail before any expensive work when results of an earlier run are present.
    if embedding_file_location.exists():
        raise FileExistsError("There is already an embeddings.npz file in the directory of your library spectra")
    if top_k_tanimoto_score_file_location.exists():
        raise FileExistsError(
            "There is already an top_k_tanimoto_scores.parquet file in the directory of your library spectra"
        )
    # NOTE(review): library_metadata.parquet is not guarded like the two files
    # above, so an existing one is presumably overwritten - confirm intended.

    library_spectra = list(tqdm(load_spectra(library_spectra_file), "Loading library spectra"))
    library_spectra = AnnotatedSpectrumSet.create_spectrum_set(library_spectra)
    ms2deepscore_model = load_model(ms2deepscore_model_file_name)
    library_spectra.add_embeddings(ms2deepscore_model)

    # NOTE(review): reaches into the private ``_embeddings`` attribute; switch
    # to a public accessor if AnnotatedSpectrumSet offers one.
    library_spectra._embeddings.save(embedding_file_location)

    # 4096 is presumably the fingerprint length in bits - TODO confirm.
    fingerprints = Fingerprints.from_spectrum_set(library_spectra, "daylight", 4096)
    # The library is scored against itself, keeping the top 8 per entry.
    top_k_tanimoto_scores = TopKTanimotoScores.calculate_from_fingerprints(
        fingerprints,
        fingerprints,
        k=8,
    )
    top_k_tanimoto_scores.save(top_k_tanimoto_score_file_location)

    reference_metadata = extract_metadata_from_library(
        library_spectra,
        [
            "precursor_mz",
            "retention_time",
            "collision_energy",
            "compound_name",
            "smiles",
            "inchikey",
        ],
    )
    reference_metadata.to_parquet(reference_metadata_file)
115+
116+
76117def extract_metadata_from_library (spectra : AnnotatedSpectrumSet , metadata_to_collect : list ):
77118 collected_metadata = {key : [] for key in metadata_to_collect }
78119 collected_metadata ["spectrum_hashes" ] = []
0 commit comments