@@ -321,68 +321,137 @@ def upsert_many(self, rows: Iterable[Dict[str, Any]]) -> List[str]:
321321 cur .executemany (UPSERT_SQL .format (table = self .table ), payloads )
322322 return comp_ids
323323
def overwrite_metadata_from_dataframe(
    self,
    df: pd.DataFrame,
    *,
    column_mapper: Optional[Dict[str, str]] = None,
    chunksize: int = 50_000,
    staging_table: str = "_staging_compounds",
) -> dict:
    """
    Fast initialize/replace of the compounds table from a wide DataFrame.

    The target table is dropped and recreated, so any previously stored
    rows are discarded and replaced by the contents of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing compound metadata.
    column_mapper : Optional[Dict[str, str]], optional
        Mapping of expected compound fields to your DataFrame column names.
        Supported keys (values are your df column names):
          - 'comp_id' (14-char key; if present, used as-is)
          - 'inchikey' (full key; used to derive comp_id if no comp_id provided)
          - 'smiles'
          - 'inchi'
          - 'classyfire_class'
          - 'classyfire_superclass'
        If omitted, common aliases for the 14-char key are auto-detected:
        'nchikey', 'inchikey14', 'inchikey_14', 'ik14', 'comp_id'.
    chunksize : int, optional
        Number of rows per chunk when writing to the staging table,
        by default 50_000.
    staging_table : str, optional
        Name of the temporary staging table, dropped when the copy succeeds.

    Returns
    -------
    dict
        ``{"rows": total df rows, "valid": rows with a 14-char key,
        "written": rows copied into the table, "skipped": rows dropped}``.

    Raises
    ------
    ValueError
        If neither a 14-char key nor a full 'inchikey' column can be
        resolved from ``df`` (directly or via ``column_mapper``).
    """
    if df is None or df.empty:
        return {"rows": 0, "valid": 0, "written": 0, "skipped": 0}

    # ----- resolve columns (mapper-aware with sensible defaults) -----
    default_map = {
        "comp_id": None,  # optional (14-char key)
        "inchikey": "inchikey",
        "smiles": "smiles",
        "inchi": "inchi",
        "classyfire_class": "classyfire_class",
        "classyfire_superclass": "classyfire_superclass",
    }
    cmap = {k: (column_mapper.get(k) if column_mapper and k in column_mapper else v)
            for k, v in default_map.items()}

    def _first_present(cols: list[str]) -> Optional[str]:
        """Return the first name in *cols* that is a column of *df*, else None."""
        for c in cols:
            if c in df.columns:
                return c
        return None

    # Auto-detect a 14-char key if no mapping was provided for comp_id.
    if cmap["comp_id"] is None:
        cmap["comp_id"] = _first_present(
            ["nchikey", "inchikey14", "inchikey_14", "ik14", "comp_id"]
        )

    # We need either a 14-char key or a full inchikey (possibly via mapping).
    has_comp14 = cmap["comp_id"] is not None and cmap["comp_id"] in df.columns
    has_fullik = cmap["inchikey"] is not None and cmap["inchikey"] in df.columns
    if not has_comp14 and not has_fullik:
        raise ValueError(
            "DataFrame must contain either a 14-char key "
            "(map it via column_mapper['comp_id']) or a full 'inchikey' "
            "(map it via column_mapper['inchikey'])."
        )

    # ----- build minimal working frame -----
    work = pd.DataFrame()

    # comp_id (14-char): take it verbatim when supplied, otherwise derive
    # it from the full InChIKey.
    if has_comp14:
        work["comp_id"] = df[cmap["comp_id"]].astype(str).str.strip()
    else:
        full = df[cmap["inchikey"]].astype(str).str.strip()
        work["comp_id"] = full.map(inchikey14_from_full)

    # inchikey (full) if present.
    work["inchikey"] = df[cmap["inchikey"]].astype(str).str.strip() if has_fullik else None

    # Optional metadata (mapper-aware); missing columns become NULLs.
    for k in ("smiles", "inchi", "classyfire_class", "classyfire_superclass"):
        src = cmap[k]
        work[k] = df[src] if (src is not None and src in df.columns) else None

    # ----- validate / deduplicate -----
    comp = work["comp_id"].astype(str).str.strip()
    valid_mask = comp.str.len().eq(14) & comp.ne("")
    skipped = int((~valid_mask).sum())

    work = (work.loc[valid_mask, ["comp_id", "inchikey", "smiles", "inchi",
                                  "classyfire_class", "classyfire_superclass"]]
            .drop_duplicates(subset=["comp_id"], keep="last")
            .reset_index(drop=True))

    if work.empty:
        return {"rows": int(len(df)), "valid": 0, "written": 0, "skipped": int(skipped)}

    # ----- bulk load: staging -> recreate main table (fast) -----
    cur = self._conn.cursor()
    # Remember the current durability setting so it can be restored after the
    # bulk load; previously synchronous=OFF silently persisted on the
    # connection for all subsequent writes.
    prev_sync = cur.execute("PRAGMA synchronous").fetchone()[0]
    cur.execute("PRAGMA synchronous=OFF")
    cur.execute("PRAGMA temp_store=MEMORY")
    cur.execute("PRAGMA cache_size=-200000")

    try:
        # 1) Write to staging in chunks. Use pandas' default executemany path
        #    (no method="multi"): a multi-row VALUES insert binds
        #    chunksize * n_columns parameters in one statement, which blows
        #    past SQLite's host-parameter limit for large chunks.
        work.to_sql(staging_table, self._conn, if_exists="replace",
                    index=False, chunksize=chunksize)

        # 2) Atomically recreate the target table with schema + copy from staging.
        with self._tx() as tcur:
            tcur.execute(f"DROP TABLE IF EXISTS {self.table}")
            tcur.executescript(SCHEMA_SQL.format(table=self.table))
            tcur.execute(f"""
                INSERT INTO {self.table} (
                    comp_id, smiles, inchi, inchikey,
                    classyfire_class, classyfire_superclass
                )
                SELECT comp_id, smiles, inchi, inchikey,
                       classyfire_class, classyfire_superclass
                FROM {staging_table}
            """)
            # sqlite3 may report rowcount == -1 here; -1 is truthy, so the
            # old `rowcount or len(work)` fallback could return -1. Guard
            # explicitly and fall back to the staged row count.
            rc = tcur.rowcount
            written = rc if rc is not None and rc > 0 else len(work)
            tcur.execute(f"DROP TABLE IF EXISTS {staging_table}")
    finally:
        # Restore durability regardless of load success.
        cur.execute(f"PRAGMA synchronous={prev_sync}")

    # Make sure indexes & settings table exist (idempotent).
    self._ensure_schema_and_settings()

    return {"rows": int(len(df)), "valid": int(len(work)),
            "written": int(written), "skipped": int(skipped)}
386455
387456 def compute_fingerprints (
388457 self ,
@@ -447,7 +516,8 @@ def get_fingerprints(self, comp_id_list: List[str]):
447516 for cid in comp_id_list :
448517 r = next ((row for row in rows if row ["comp_id" ] == cid ), None )
449518 if r is None :
450- out .append (None ); continue
519+ out .append (None )
520+ continue
451521 dense_blob = r ["fingerprint_dense" ] or b""
452522 bits_blob = r ["fingerprint_bits" ] or b""
453523 counts_blob = r ["fingerprint_counts" ] or b""
0 commit comments