Skip to content

Commit 63a429b

Browse files
committed
replace dataframe addition method + add tests
1 parent cbd51f6 commit 63a429b

File tree

2 files changed

+224
-44
lines changed

2 files changed

+224
-44
lines changed

ms2query/database/compound_database.py

Lines changed: 111 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -321,68 +321,137 @@ def upsert_many(self, rows: Iterable[Dict[str, Any]]) -> List[str]:
321321
cur.executemany(UPSERT_SQL.format(table=self.table), payloads)
322322
return comp_ids
323323

324-
def overwrite_metadata_from_dataframe(
    self,
    df: pd.DataFrame,
    *,
    column_mapper: Optional[Dict[str, str]] = None,
    chunksize: int = 50_000,
    staging_table: str = "_staging_compounds",
) -> dict:
    """
    Fast initialize/replace of the compounds table from a wide DataFrame.

    The target table is dropped and recreated, so this is a true replace:
    rows not present in `df` are gone afterwards.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing compound metadata.
    column_mapper : Optional[Dict[str, str]], optional
        Mapping of DataFrame columns to expected compound fields, by default None.
    chunksize : int, optional
        Number of rows per chunk when writing to the database, by default 50_000.
    staging_table : str, optional
        Name of the temporary staging table used for the bulk load,
        by default "_staging_compounds". It is dropped when the load finishes.

    Returns
    -------
    dict
        Stats with keys 'rows' (incoming), 'valid' (after validation and
        de-duplication), 'written' (rows copied into the main table) and
        'skipped' (rows with a missing/malformed 14-char key).

    Pass `column_mapper` to map your DataFrame columns to the expected names.
    Supported mapping keys (values are your df column names):
      - 'comp_id' (14-char key; if present, used as-is)
      - 'inchikey' (full key; used to derive comp_id if no comp_id provided)
      - 'smiles'
      - 'inchi'
      - 'classyfire_class'
      - 'classyfire_superclass'

    If you don't pass a mapper, this will auto-detect common aliases for the 14-char key:
    'nchikey', 'inchikey14', 'inchikey_14', 'ik14', 'comp_id'
    """
    if df is None or df.empty:
        return {"rows": 0, "valid": 0, "written": 0, "skipped": 0}

    # ----- resolve columns (mapper-aware with sensible defaults) -----
    default_map = {
        "comp_id": None,  # optional (14-char key)
        "inchikey": "inchikey",
        "smiles": "smiles",
        "inchi": "inchi",
        "classyfire_class": "classyfire_class",
        "classyfire_superclass": "classyfire_superclass",
    }
    cmap = {k: (column_mapper.get(k) if column_mapper and k in column_mapper else v)
            for k, v in default_map.items()}

    # Auto-detect a 14-char key if no mapping was provided for comp_id
    def _first_present(cols: list[str]) -> Optional[str]:
        for c in cols:
            if c in df.columns:
                return c
        return None

    if cmap["comp_id"] is None:
        cmap["comp_id"] = _first_present(["nchikey", "inchikey14", "inchikey_14", "ik14", "comp_id"])

    # We need either a 14-char key or a full inchikey (possibly via mapping)
    has_comp14 = cmap["comp_id"] is not None and cmap["comp_id"] in df.columns
    has_fullik = cmap["inchikey"] is not None and cmap["inchikey"] in df.columns
    if not has_comp14 and not has_fullik:
        raise ValueError(
            "DataFrame must contain either a 14-char key "
            "(map it via column_mapper['comp_id']) or a full 'inchikey' "
            "(map it via column_mapper['inchikey'])."
        )

    # ----- build minimal working frame -----
    work = pd.DataFrame()

    # comp_id (14-char): take as-is when supplied, otherwise derive it
    if has_comp14:
        work["comp_id"] = df[cmap["comp_id"]].astype(str).str.strip()
    else:
        # derive from full inchikey
        full = df[cmap["inchikey"]].astype(str).str.strip()
        work["comp_id"] = full.map(inchikey14_from_full)

    # inchikey (full) if present; otherwise stored as NULL
    work["inchikey"] = df[cmap["inchikey"]].astype(str).str.strip() if has_fullik else None

    # optional metadata (mapper-aware); missing columns become NULL
    for k in ("smiles", "inchi", "classyfire_class", "classyfire_superclass"):
        src = cmap[k]
        work[k] = df[src] if (src is not None and src in df.columns) else None

    # ----- validate / deduplicate -----
    comp = work["comp_id"].astype(str).str.strip()
    valid_mask = comp.str.len().eq(14) & comp.ne("")
    skipped = int((~valid_mask).sum())

    work = (work.loc[valid_mask, ["comp_id", "inchikey", "smiles", "inchi",
                                  "classyfire_class", "classyfire_superclass"]]
            .drop_duplicates(subset=["comp_id"], keep="last")
            .reset_index(drop=True))

    if work.empty:
        return {"rows": int(len(df)), "valid": 0, "written": 0, "skipped": int(skipped)}

    # ----- bulk load: staging -> recreate main table (fast) -----
    cur = self._conn.cursor()
    # Speed PRAGMAs during load. Remember the previous synchronous level so
    # it can be restored afterwards: leaving synchronous=OFF would silently
    # disable durability for every later write on this connection.
    prev_sync = cur.execute("PRAGMA synchronous").fetchone()[0]
    cur.execute("PRAGMA synchronous=OFF")
    cur.execute("PRAGMA temp_store=MEMORY")
    cur.execute("PRAGMA cache_size=-200000")

    try:
        # 1) write to staging with big chunks & multi-row inserts
        work.to_sql(staging_table, self._conn, if_exists="replace", index=False,
                    chunksize=chunksize, method="multi")

        # 2) atomically recreate the target table with schema + copy from staging
        # NOTE(review): sqlite3's executescript() implicitly commits any
        # pending transaction before running (on Python < 3.12), which may
        # weaken the atomicity provided by self._tx() — confirm.
        with self._tx() as cur:
            cur.execute(f"DROP TABLE IF EXISTS {self.table}")
            cur.executescript(SCHEMA_SQL.format(table=self.table))
            cur.execute(f"""
                INSERT INTO {self.table} (
                    comp_id, smiles, inchi, inchikey,
                    classyfire_class, classyfire_superclass
                )
                SELECT comp_id, smiles, inchi, inchikey,
                       classyfire_class, classyfire_superclass
                FROM {staging_table}
            """)
            # DB-API allows rowcount == -1 ("not determined"); `rowcount or
            # len(work)` would then report -1, so only trust positive counts.
            written = cur.rowcount if cur.rowcount and cur.rowcount > 0 else len(work)
            cur.execute(f"DROP TABLE IF EXISTS {staging_table}")
    finally:
        # Restore durability even if the load failed part-way.
        self._conn.execute(f"PRAGMA synchronous={prev_sync}")

    # Make sure indexes & settings table exist (idempotent)
    self._ensure_schema_and_settings()

    return {"rows": int(len(df)), "valid": int(len(work)), "written": int(written), "skipped": int(skipped)}
386455

387456
def compute_fingerprints(
388457
self,
@@ -447,7 +516,8 @@ def get_fingerprints(self, comp_id_list: List[str]):
447516
for cid in comp_id_list:
448517
r = next((row for row in rows if row["comp_id"] == cid), None)
449518
if r is None:
450-
out.append(None); continue
519+
out.append(None)
520+
continue
451521
dense_blob = r["fingerprint_dense"] or b""
452522
bits_blob = r["fingerprint_bits"] or b""
453523
counts_blob = r["fingerprint_counts"] or b""

tests/test_compound_database.py

Lines changed: 113 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import sqlite3
22
from pathlib import Path
33
import numpy as np
4+
import pandas as pd
45
import pytest
5-
from ms2query.data_processing import compute_morgan_fingerprints
6+
from ms2query.data_processing import compute_morgan_fingerprints, inchikey14_from_full
67
from ms2query.database.compound_database import (
78
CompoundDatabase,
89
)
@@ -64,7 +65,6 @@ def test_compute_fingerprints_contract():
6465
bits, counts = fp
6566
assert isinstance(bits, np.ndarray) and bits.dtype == np.uint32
6667
assert isinstance(counts, np.ndarray)
67-
# counts are usually integer-like (could be float if you later scale)
6868
assert counts.ndim == 1
6969

7070
# -------------------------
@@ -187,4 +187,114 @@ def test_compute_fingerprints_method(count, sparse):
187187
else:
188188
assert np.allclose(fps_directly[0], fps_after[0])
189189
cdb.close()
190-
190+
191+
192+
def test_overwrite_metadata_from_dataframe_basic_and_mapping(tmp_path):
    """Mapped aliases, invalid-key skipping, duplicate keep-last and NULL columns."""
    db_path = tmp_path / "compounds.sqlite"
    cdb = CompoundDatabase(str(db_path))

    # Wide DF with aliases + extras; includes:
    # - valid 14-char keys via 'nchikey'
    # - one invalid key (too short) -> skipped
    # - one duplicate comp_id -> keep last
    df = pd.DataFrame({
        "nchikey": ["AAAQFGUYHFJNHI", "AABFWJDLCCDJJN", "SHORTKEY", "AABFWJDLCCDJJN"],
        "smiles": ["S1", "S2", "S_bad", "S2_override"],
        "cf_class": ["C1", "C2", "C_bad", "C2_override"],
        "cf_superclass": ["SC1", "SC2", "SC_bad", "SC2_override"],
        "mass": [423.146, 324.126, 0.0, 999.0],  # extra column to be ignored
    })

    stats = cdb.overwrite_metadata_from_dataframe(
        df,
        column_mapper={  # map aliases -> expected names
            "comp_id": "nchikey",
            "smiles": "smiles",
            "classyfire_class": "cf_class",
            "classyfire_superclass": "cf_superclass",
        }
    )

    # Rows: 4 incoming, 1 invalid (SHORTKEY) -> skipped=1
    # Valid comp_ids: AAAQFGUYHFJNHI, AABFWJDLCCDJJN (duplicate -> keep last) => written=2
    assert stats["rows"] == 4
    assert stats["skipped"] == 1
    assert stats["valid"] == 2
    assert stats["written"] == 2

    # Check DB content
    df_db = pd.read_sql_query("SELECT comp_id, smiles, classyfire_class, classyfire_superclass, inchikey, inchi FROM compounds", cdb._conn)
    assert set(df_db["comp_id"]) == {"AAAQFGUYHFJNHI", "AABFWJDLCCDJJN"}

    # Row without full inchikey provided -> stored as NULL. Inchi not provided -> NULL.
    # Use pd.isna instead of `in (None, np.nan, "")`: tuple membership relies on
    # NaN object identity, which a pandas-produced NaN is not guaranteed to have.
    ik_val = df_db.loc[df_db["comp_id"] == "AAAQFGUYHFJNHI", "inchikey"].iloc[0]
    inchi_val = df_db.loc[df_db["comp_id"] == "AAAQFGUYHFJNHI", "inchi"].iloc[0]
    assert pd.isna(ik_val) or ik_val == ""
    assert pd.isna(inchi_val) or inchi_val == ""

    # "keep last" behavior for duplicate comp_id
    r = df_db.set_index("comp_id").loc["AABFWJDLCCDJJN"]
    assert r["smiles"] == "S2_override"
    assert r["classyfire_class"] == "C2_override"
    assert r["classyfire_superclass"] == "SC2_override"

    # Settings table is intact and readable
    settings = cdb.get_fingerprint_settings()
    assert {"nbits", "radius", "sparse", "count", "dtype"} <= set(settings.keys())

    cdb.close()
244+
245+
246+
def test_overwrite_metadata_from_dataframe_derive_comp_id_and_true_replace(tmp_path):
    """comp_id derivation from full InChIKeys, then a second load that fully replaces."""
    database = CompoundDatabase(str(tmp_path / "compounds.sqlite"))

    # Load #1: no 14-char key column at all — comp_id must come from the
    # full InChIKey, which lives under a custom column name.
    first_batch = pd.DataFrame({
        "IK_FULL": [
            "BQJCRHHNABKAKU-KBQPJGBKSA-N",
            "BSYNRYMUTXBXSQ-UHFFFAOYSA-N",
        ],
        "smiles": ["CCO", "O=C=O"],
        "inchi": ["InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3", "InChI=1S/CO2/c2-1-3"],
        "cf_class": ["Alcohols", "Carbon oxides"],
        "cf_superclass": ["Organooxygen compounds", "Inorganic compounds"],
    })

    first_stats = database.overwrite_metadata_from_dataframe(
        first_batch,
        column_mapper={
            "inchikey": "IK_FULL",  # derive comp_id from full IK
            "smiles": "smiles",
            "inchi": "inchi",
            "classyfire_class": "cf_class",
            "classyfire_superclass": "cf_superclass",
        }
    )
    assert first_stats["written"] == 2

    loaded = pd.read_sql_query("SELECT comp_id, inchikey, smiles FROM compounds ORDER BY comp_id", database._conn)
    # every stored comp_id equals inchikey14_from_full(inchikey)
    for stored_id, full_key in zip(loaded["comp_id"], loaded["inchikey"]):
        assert stored_id == inchikey14_from_full(full_key)

    # Load #2: a disjoint compound set — the first batch must vanish entirely.
    replacement = pd.DataFrame({
        "IK_FULL": ["AAOVKJBEBIDNHE-UHFFFAOYSA-N"],
        "smiles": ["CC(=O)O"],
        "cf_class": ["Carboxylic acids"],
        "cf_superclass": ["Organooxygen compounds"],
    })
    second_stats = database.overwrite_metadata_from_dataframe(
        replacement,
        column_mapper={
            "inchikey": "IK_FULL",
            "smiles": "smiles",
            "classyfire_class": "cf_class",
            "classyfire_superclass": "cf_superclass",
        }
    )
    assert second_stats["written"] == 1

    remaining = pd.read_sql_query("SELECT comp_id, inchikey, smiles FROM compounds", database._conn)
    assert len(remaining) == 1
    assert remaining.iloc[0]["comp_id"] == inchikey14_from_full(remaining.iloc[0]["inchikey"])
    assert set(remaining["smiles"]) == {"CC(=O)O"}  # previous rows gone (true replace)

    database.close()

0 commit comments

Comments
 (0)