Skip to content

Commit f8a09b9

Browse files
HFooladiclaude
andcommitted
feat(cli): add standalone featurize command for pre-computing features
Add a new `themap featurize` CLI command that allows users to compute and cache molecular features without performing distance computation. Features: - Support for single file or directory input - Multiple featurizers can be specified (-f ecfp -f maccs) - Fold selection (train/test/valid/all) - Cache directory configuration - Force recomputation option - Progress bar with success/failure tracking Usage: themap featurize datasets/ -f ecfp themap featurize datasets/train/CHEMBL123.jsonl.gz -f maccs Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 62dca09 commit f8a09b9

File tree

1 file changed

+189
-1
lines changed

1 file changed

+189
-1
lines changed

themap/cli.py

Lines changed: 189 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@
99
themap run config.yaml -o output/ # Run with custom output
1010
themap init # Create sample config file
1111
themap convert input.csv CHEMBL123 # Convert CSV to JSONL.GZ
12+
themap featurize datasets/ -f ecfp # Featurize datasets (no distance computation)
1213
themap list-featurizers # List available featurizers
1314
"""
1415

1516
from pathlib import Path
16-
from typing import Optional
17+
from typing import List, Optional, Tuple
1718

1819
import click
1920

@@ -317,6 +318,193 @@ def info(data_dir: str) -> None:
317318
click.echo(f"\nProteins: {stats['proteins']['count']} FASTA files")
318319

319320

321+
@cli.command()
322+
@click.argument("data_path", type=click.Path(exists=True))
323+
@click.option(
324+
"--featurizer",
325+
"-f",
326+
multiple=True,
327+
default=["ecfp"],
328+
help="Featurizer(s) to use (can be specified multiple times)",
329+
)
330+
@click.option("--cache-dir", "-c", default="feature_cache", help="Directory to cache features")
331+
@click.option(
332+
"--fold",
333+
type=click.Choice(["train", "test", "valid", "all"]),
334+
default="all",
335+
help="Which fold(s) to featurize",
336+
)
337+
@click.option("--n-jobs", "-j", default=1, help="Number of parallel jobs for featurization")
338+
@click.option("--force", is_flag=True, help="Recompute features even if cached")
339+
@click.pass_context
340+
def featurize(
341+
ctx: click.Context,
342+
data_path: str,
343+
featurizer: Tuple[str, ...],
344+
cache_dir: str,
345+
fold: str,
346+
n_jobs: int,
347+
force: bool,
348+
) -> None:
349+
"""Compute and cache molecular features without distance computation.
350+
351+
DATA_PATH can be either:
352+
- A directory with train/test/valid folders containing datasets
353+
- A single dataset file (.jsonl.gz or .csv)
354+
355+
Features are cached to disk and can be reused by other commands.
356+
357+
Examples:
358+
# Featurize all datasets in a directory with ECFP
359+
themap featurize datasets/ -f ecfp
360+
361+
# Featurize with multiple featurizers
362+
themap featurize datasets/ -f ecfp -f maccs -f desc2D
363+
364+
# Featurize only training data
365+
themap featurize datasets/ -f ecfp --fold train
366+
367+
# Featurize a single file
368+
themap featurize datasets/train/CHEMBL123.jsonl.gz -f ecfp
369+
370+
# Force recomputation (ignore cache)
371+
themap featurize datasets/ -f ecfp --force
372+
373+
# Custom cache directory
374+
themap featurize datasets/ -f ecfp --cache-dir my_cache/
375+
"""
376+
from .data.loader import DatasetLoader
377+
from .data.molecule_dataset import MoleculeDataset
378+
from .pipeline.featurization import FeaturizationPipeline
379+
380+
data_path_obj = Path(data_path)
381+
cache_path = Path(cache_dir)
382+
featurizer_list: List[str] = list(featurizer)
383+
384+
click.echo(f"Featurizing data from: {data_path}")
385+
click.echo(f"Featurizers: {', '.join(featurizer_list)}")
386+
click.echo(f"Cache directory: {cache_path}")
387+
388+
try:
389+
# Determine if input is a file or directory
390+
if data_path_obj.is_file():
391+
# Single file mode
392+
click.echo(f"\nProcessing single file: {data_path_obj.name}")
393+
task_id = data_path_obj.stem.replace(".jsonl", "")
394+
395+
dataset = MoleculeDataset.load_from_file(data_path_obj)
396+
datasets = [dataset]
397+
dataset_names = [task_id]
398+
399+
click.echo(f" Loaded {len(dataset)} molecules")
400+
else:
401+
# Directory mode
402+
loader = DatasetLoader(data_path_obj)
403+
stats = loader.get_statistics()
404+
405+
click.echo(f"\nDataset directory: {stats['data_dir']}")
406+
407+
datasets = []
408+
dataset_names = []
409+
410+
# Determine which folds to process
411+
folds_to_process = ["train", "test", "valid"] if fold == "all" else [fold]
412+
413+
for fold_name in folds_to_process:
414+
if fold_name not in stats.get("folds", {}):
415+
continue
416+
417+
fold_stats = stats["folds"][fold_name]
418+
click.echo(f"\n{fold_name.capitalize()} fold: {fold_stats['task_count']} tasks")
419+
420+
fold_datasets = loader.load_datasets(fold_name)
421+
for task_id, ds in fold_datasets.items():
422+
datasets.append(ds)
423+
dataset_names.append(f"{fold_name}_{task_id}")
424+
425+
if not datasets:
426+
click.echo("No datasets found to featurize.", err=True)
427+
raise SystemExit(1)
428+
429+
click.echo(f"\nTotal datasets to featurize: {len(datasets)}")
430+
431+
# Process each featurizer
432+
for feat_name in featurizer_list:
433+
click.echo(f"\n{'=' * 50}")
434+
click.echo(f"Featurizer: {feat_name}")
435+
click.echo(f"{'=' * 50}")
436+
437+
pipeline = FeaturizationPipeline(
438+
cache_dir=cache_path,
439+
molecule_featurizer=feat_name,
440+
)
441+
442+
# Check cache status
443+
if not force:
444+
cached_count = 0
445+
for ds in datasets:
446+
if pipeline.store.has_molecule_features(ds.task_id, feat_name):
447+
cached_count += 1
448+
449+
if cached_count > 0:
450+
click.echo(f" Found {cached_count}/{len(datasets)} datasets already cached")
451+
if cached_count == len(datasets):
452+
click.echo(" All datasets already cached. Use --force to recompute.")
453+
continue
454+
455+
# Clear cache if force flag is set
456+
if force:
457+
click.echo(" Clearing existing cache...")
458+
pipeline.store.clear_cache(feat_name)
459+
460+
# Featurize all datasets
461+
click.echo(f" Computing features for {len(datasets)} datasets...")
462+
463+
with click.progressbar(
464+
zip(datasets, dataset_names),
465+
length=len(datasets),
466+
label=" Featurizing",
467+
) as bar:
468+
success_count = 0
469+
fail_count = 0
470+
471+
for ds, name in bar:
472+
try:
473+
# Check if already cached
474+
if not force and pipeline.store.has_molecule_features(ds.task_id, feat_name):
475+
success_count += 1
476+
continue
477+
478+
# Featurize
479+
pipeline.featurize_all_datasets([ds])
480+
success_count += 1
481+
except Exception as e:
482+
fail_count += 1
483+
if ctx.obj.get("verbose"):
484+
click.echo(f"\n Error featurizing {name}: {e}", err=True)
485+
486+
click.echo(f" Completed: {success_count} succeeded, {fail_count} failed")
487+
488+
# Show cache location
489+
cache_subdir = cache_path / "molecules" / feat_name
490+
if cache_subdir.exists():
491+
n_cached = len(list(cache_subdir.glob("*.npz")))
492+
click.echo(f" Cached features: {cache_subdir} ({n_cached} files)")
493+
494+
click.echo("\nFeaturization complete!")
495+
click.echo(f"Features cached at: {cache_path}")
496+
click.echo("\nTo use cached features in distance computation:")
497+
click.echo(f" themap quick {data_path} --featurizer {featurizer_list[0]}")
498+
499+
except Exception as e:
500+
click.echo(f"Error: {e}", err=True)
501+
if ctx.obj.get("verbose"):
502+
import traceback
503+
504+
traceback.print_exc()
505+
raise SystemExit(1)
506+
507+
320508
def main() -> None:
321509
"""Entry point for CLI."""
322510
cli()

0 commit comments

Comments
 (0)