|
| 1 | +# tests/test_library_workflow.py |
| 2 | +from pathlib import Path |
| 3 | +import numpy as np |
| 4 | +import pytest |
| 5 | +from matchms.importing import load_spectra |
| 6 | +from ms2query import MS2QueryDatabase |
| 7 | +from ms2query.library_io import create_new_library, load_created_library |
| 8 | + |
| 9 | + |
# Compound id (InChIKey14) expected to be present in the test data.
TEST_COMP_ID = "ZBSGKPYXQINNGF"

# Expected (rows, columns) of the dataframe returned by
# ``metadata_by_comp_id(TEST_COMP_ID)`` on the freshly created library.
# NOTE(review): 11 columns vs. the 10 fields listed below -- presumably one
# extra identifier column is returned as well; confirm against the database.
EXPECTED_METADATA_SHAPE = (5, 11)

# Metadata fields that must be exposed by the database wrapper and be present
# as columns in the per-compound metadata dataframe.
EXPECTED_METADATA_FIELDS = [
    "precursor_mz", "ionmode", "smiles", "inchikey", "inchi", "name",
    "charge", "instrument_type", "adduct", "collision_energy",
]
| 16 | + |
| 17 | + |
def _data_dir() -> Path:
    """Return the ``test_data`` directory located next to this test module."""
    return Path(__file__).parent / "test_data"
| 20 | + |
| 21 | + |
def _paths():
    """Locate the spectra and model fixtures used by the workflow test.

    Returns:
        A ``(spectra_file, model_path)`` tuple of ``Path`` objects pointing
        into the test data directory.

    Raises:
        AssertionError: If either fixture is missing, so the test fails with
            a clear message instead of an obscure error further down.
    """
    data_dir = _data_dir()
    spectra_file = data_dir / "10_spectra.mgf"
    model_path = data_dir / "ms2deepscore_testmodel_v1.pt"
    # is_file() rather than exists(): a directory with the same name must not
    # be mistaken for the fixture.
    assert spectra_file.is_file(), f"Missing test spectra file: {spectra_file}"
    assert model_path.is_file(), f"Missing test model file: {model_path}"
    return spectra_file, model_path
| 29 | + |
| 30 | + |
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_create_and_load_library(tmp_path: Path):
    """Build a library from the test spectra, then reload and query it.

    The test checks, in order:
      1. ``create_new_library`` returns a library backed by an
         ``MS2QueryDatabase`` and writes the SQLite file to the output folder.
      2. The per-compound metadata query has the expected shape and fields.
      3. Both embedding-index files were written to the output folder.
      4. ``load_created_library`` restores a library from that folder.
      5. A top-k embedding query on the reloaded library returns ``k`` hits
         carrying ``rank``, ``spec_id`` and ``score``.

    Args:
        tmp_path: pytest-provided temporary directory for the library output.
    """
    spectra_file, model_path = _paths()
    top_k = 3  # number of hits requested from the embedding index

    # ---------- Create ----------
    outdir = tmp_path / "results"
    outdir.mkdir(parents=True, exist_ok=True)

    lib = create_new_library(
        spectra_files=[str(spectra_file)],
        annotation_files=[],  # currently unused in the workflow
        output_folder=str(outdir),
        model_path=str(model_path),
        # Keep the index small/fast for CI:
        embedding_index_params={
            "M": 8,
            "ef_construction": 50,
            "post_init_ef": 50,
            "batch_rows": 100_000,
        },
    )
    # basic sanity
    assert lib is not None
    assert isinstance(lib.db, MS2QueryDatabase)

    # SQLite must exist
    db_path = outdir / "ms2query_library.sqlite"
    assert db_path.exists(), "MS2Query database file was not created."

    # ---------- DB content checks ----------
    ms2query_db = lib.db

    # Metadata query by compound id
    df_meta = ms2query_db.metadata_by_comp_id(TEST_COMP_ID)
    assert tuple(df_meta.shape) == EXPECTED_METADATA_SHAPE

    # Metadata fields must be present both in the db wrapper and in the
    # returned dataframe. Collect every missing field so one failure reports
    # all of them rather than only the first.
    md_fields = ms2query_db.metadata_fields
    missing_in_db = [name for name in EXPECTED_METADATA_FIELDS if name not in md_fields]
    assert not missing_in_db, f"Metadata fields missing in the database: {missing_in_db}"
    missing_in_df = [name for name in EXPECTED_METADATA_FIELDS if name not in df_meta.columns]
    assert not missing_in_df, f"Metadata fields missing in metadata_by_comp_id result: {missing_in_df}"

    # ---------- Embedding index artifacts ----------
    # The workflow saves with a base prefix (no extension). NMSLIB writes two
    # files: <prefix> and <prefix>.dat
    emb_prefix = outdir / "embedding_index"
    # Append ".dat" to the name instead of using with_suffix(), which would
    # replace an existing suffix if the prefix ever contained a dot.
    emb_data = emb_prefix.with_name(emb_prefix.name + ".dat")
    assert emb_prefix.exists(), f"EmbeddingIndex files not found: missing {emb_prefix.name}"
    assert emb_data.exists(), f"EmbeddingIndex files not found: missing {emb_data.name}"

    # ---------- Load ----------
    lib2 = load_created_library(str(outdir))
    assert lib2 is not None
    assert isinstance(lib2.db, MS2QueryDatabase)

    # ---------- Tiny embedding-index query ----------
    # Take one spectrum from the same file and try a small top-k query.
    # The path is passed as str, like every other file argument in this test.
    spectra = list(load_spectra(str(spectra_file)))
    assert spectra, "No spectra parsed from test file."

    # A single query spectrum should yield a one-element list of hit lists.
    results = lib2.query_embedding_index(spectra[0], k=top_k, return_dataframe=False)
    assert isinstance(results, list)
    assert len(results) == 1

    hits = results[0]
    assert len(hits) == top_k, f"k was set to {top_k}, but returned {len(hits)} hits."
    assert set(hits[0].keys()) >= {"rank", "spec_id", "score"}
    assert isinstance(hits[0]["spec_id"], (str, np.str_))
0 commit comments