9 | 9 | themap run config.yaml -o output/ # Run with custom output |
10 | 10 | themap init # Create sample config file |
11 | 11 | themap convert input.csv CHEMBL123 # Convert CSV to JSONL.GZ |
| 12 | + themap featurize datasets/ -f ecfp # Featurize datasets (no distance computation) |
12 | 13 | themap list-featurizers # List available featurizers |
13 | 14 | """ |
14 | 15 | |
15 | 16 | from pathlib import Path |
16 | | -from typing import Optional |
| 17 | +from typing import List, Optional, Tuple |
17 | 18 | |
18 | 19 | import click |
19 | 20 | |
@@ -317,6 +318,193 @@ def info(data_dir: str) -> None: |
317 | 318 | click.echo(f"\nProteins: {stats['proteins']['count']} FASTA files") |
318 | 319 | |
319 | 320 | |
| 321 | +@cli.command() |
| 322 | +@click.argument("data_path", type=click.Path(exists=True)) |
| 323 | +@click.option( |
| 324 | + "--featurizer", |
| 325 | + "-f", |
| 326 | + multiple=True, |
| 327 | + default=["ecfp"], |
| 328 | + help="Featurizer(s) to use (can be specified multiple times)", |
| 329 | +) |
| 330 | +@click.option("--cache-dir", "-c", default="feature_cache", help="Directory to cache features") |
| 331 | +@click.option( |
| 332 | + "--fold", |
| 333 | + type=click.Choice(["train", "test", "valid", "all"]), |
| 334 | + default="all", |
| 335 | + help="Which fold(s) to featurize", |
| 336 | +) |
| 337 | +@click.option("--n-jobs", "-j", default=1, help="Number of parallel jobs for featurization") |
| 338 | +@click.option("--force", is_flag=True, help="Recompute features even if cached") |
| 339 | +@click.pass_context |
| 340 | +def featurize( |
| 341 | + ctx: click.Context, |
| 342 | + data_path: str, |
| 343 | + featurizer: Tuple[str, ...], |
| 344 | + cache_dir: str, |
| 345 | + fold: str, |
| 346 | + n_jobs: int, |
| 347 | + force: bool, |
| 348 | +) -> None: |
| 349 | + """Compute and cache molecular features without distance computation. |
| 350 | + |
| 351 | + DATA_PATH can be either: |
| 352 | + - A directory with train/test/valid folders containing datasets |
| 353 | + - A single dataset file (.jsonl.gz or .csv) |
| 354 | + |
| 355 | + Features are cached to disk and can be reused by other commands. |
| 356 | + |
| 357 | + Examples: |
| 358 | + # Featurize all datasets in a directory with ECFP |
| 359 | + themap featurize datasets/ -f ecfp |
| 360 | + |
| 361 | + # Featurize with multiple featurizers |
| 362 | + themap featurize datasets/ -f ecfp -f maccs -f desc2D |
| 363 | + |
| 364 | + # Featurize only training data |
| 365 | + themap featurize datasets/ -f ecfp --fold train |
| 366 | + |
| 367 | + # Featurize a single file |
| 368 | + themap featurize datasets/train/CHEMBL123.jsonl.gz -f ecfp |
| 369 | + |
| 370 | + # Force recomputation (ignore cache) |
| 371 | + themap featurize datasets/ -f ecfp --force |
| 372 | + |
| 373 | + # Custom cache directory |
| 374 | + themap featurize datasets/ -f ecfp --cache-dir my_cache/ |
| 375 | + """ |
| 376 | + from .data.loader import DatasetLoader |
| 377 | + from .data.molecule_dataset import MoleculeDataset |
| 378 | + from .pipeline.featurization import FeaturizationPipeline |
| 379 | + |
| 380 | + data_path_obj = Path(data_path) |
| 381 | + cache_path = Path(cache_dir) |
| 382 | + featurizer_list: List[str] = list(featurizer) |
| 383 | + |
| 384 | + click.echo(f"Featurizing data from: {data_path}") |
| 385 | + click.echo(f"Featurizers: {', '.join(featurizer_list)}") |
| 386 | + click.echo(f"Cache directory: {cache_path}") |
| 387 | + |
| 388 | + try: |
| 389 | + # Determine if input is a file or directory |
| 390 | + if data_path_obj.is_file(): |
| 391 | + # Single file mode |
| 392 | + click.echo(f"\nProcessing single file: {data_path_obj.name}") |
| 393 | + task_id = data_path_obj.stem.replace(".jsonl", "") |
| 394 | + |
| 395 | + dataset = MoleculeDataset.load_from_file(data_path_obj) |
| 396 | + datasets = [dataset] |
| 397 | + dataset_names = [task_id] |
| 398 | + |
| 399 | + click.echo(f" Loaded {len(dataset)} molecules") |
| 400 | + else: |
| 401 | + # Directory mode |
| 402 | + loader = DatasetLoader(data_path_obj) |
| 403 | + stats = loader.get_statistics() |
| 404 | + |
| 405 | + click.echo(f"\nDataset directory: {stats['data_dir']}") |
| 406 | + |
| 407 | + datasets = [] |
| 408 | + dataset_names = [] |
| 409 | + |
| 410 | + # Determine which folds to process |
| 411 | + folds_to_process = ["train", "test", "valid"] if fold == "all" else [fold] |
| 412 | + |
| 413 | + for fold_name in folds_to_process: |
| 414 | + if fold_name not in stats.get("folds", {}): |
| 415 | + continue |
| 416 | + |
| 417 | + fold_stats = stats["folds"][fold_name] |
| 418 | + click.echo(f"\n{fold_name.capitalize()} fold: {fold_stats['task_count']} tasks") |
| 419 | + |
| 420 | + fold_datasets = loader.load_datasets(fold_name) |
| 421 | + for task_id, ds in fold_datasets.items(): |
| 422 | + datasets.append(ds) |
| 423 | + dataset_names.append(f"{fold_name}_{task_id}") |
| 424 | + |
| 425 | + if not datasets: |
| 426 | + click.echo("No datasets found to featurize.", err=True) |
| 427 | + raise SystemExit(1) |
| 428 | + |
| 429 | + click.echo(f"\nTotal datasets to featurize: {len(datasets)}") |
| 430 | + |
| 431 | + # Process each featurizer |
| 432 | + for feat_name in featurizer_list: |
| 433 | + click.echo(f"\n{'=' * 50}") |
| 434 | + click.echo(f"Featurizer: {feat_name}") |
| 435 | + click.echo(f"{'=' * 50}") |
| 436 | + |
| 437 | + pipeline = FeaturizationPipeline( |
| 438 | + cache_dir=cache_path, |
| 439 | + molecule_featurizer=feat_name, |
| 440 | + ) |
| 441 | + |
| 442 | + # Check cache status |
| 443 | + if not force: |
| 444 | + cached_count = 0 |
| 445 | + for ds in datasets: |
| 446 | + if pipeline.store.has_molecule_features(ds.task_id, feat_name): |
| 447 | + cached_count += 1 |
| 448 | + |
| 449 | + if cached_count > 0: |
| 450 | + click.echo(f" Found {cached_count}/{len(datasets)} datasets already cached") |
| 451 | + if cached_count == len(datasets): |
| 452 | + click.echo(" All datasets already cached. Use --force to recompute.") |
| 453 | + continue |
| 454 | + |
| 455 | + # Clear cache if force flag is set |
| 456 | + if force: |
| 457 | + click.echo(" Clearing existing cache...") |
| 458 | + pipeline.store.clear_cache(feat_name) |
| 459 | + |
| 460 | + # Featurize all datasets |
| 461 | + click.echo(f" Computing features for {len(datasets)} datasets...") |
| 462 | + |
| 463 | + with click.progressbar( |
| 464 | + zip(datasets, dataset_names), |
| 465 | + length=len(datasets), |
| 466 | + label=" Featurizing", |
| 467 | + ) as bar: |
| 468 | + success_count = 0 |
| 469 | + fail_count = 0 |
| 470 | + |
| 471 | + for ds, name in bar: |
| 472 | + try: |
| 473 | + # Check if already cached |
| 474 | + if not force and pipeline.store.has_molecule_features(ds.task_id, feat_name): |
| 475 | + success_count += 1 |
| 476 | + continue |
| 477 | + |
| 478 | + # Featurize |
| 479 | + pipeline.featurize_all_datasets([ds]) |
| 480 | + success_count += 1 |
| 481 | + except Exception as e: |
| 482 | + fail_count += 1 |
| 483 | + if ctx.obj.get("verbose"): |
| 484 | + click.echo(f"\n Error featurizing {name}: {e}", err=True) |
| 485 | + |
| 486 | + click.echo(f" Completed: {success_count} succeeded, {fail_count} failed") |
| 487 | + |
| 488 | + # Show cache location |
| 489 | + cache_subdir = cache_path / "molecules" / feat_name |
| 490 | + if cache_subdir.exists(): |
| 491 | + n_cached = len(list(cache_subdir.glob("*.npz"))) |
| 492 | + click.echo(f" Cached features: {cache_subdir} ({n_cached} files)") |
| 493 | + |
| 494 | + click.echo("\nFeaturization complete!") |
| 495 | + click.echo(f"Features cached at: {cache_path}") |
| 496 | + click.echo("\nTo use cached features in distance computation:") |
| 497 | + click.echo(f" themap quick {data_path} --featurizer {featurizer_list[0]}") |
| 498 | + |
| 499 | + except Exception as e: |
| 500 | + click.echo(f"Error: {e}", err=True) |
| 501 | + if ctx.obj.get("verbose"): |
| 502 | + import traceback |
| 503 | + |
| 504 | + traceback.print_exc() |
| 505 | + raise SystemExit(1) |
| 506 | + |
| 507 | + |
320 | 508 | def main() -> None: |
321 | 509 | """Entry point for CLI.""" |
322 | 510 | cli() |
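
A minimal sketch of how the cache written by `themap featurize` could be inspected afterwards. It assumes only what the command itself reports above: feature files stored as `.npz` under `<cache_dir>/molecules/<featurizer>/` (the same layout the `cache_subdir` glob checks). The directory and featurizer names below are the command's defaults; the array keys inside each `.npz` are an internal detail of themap's feature store, so the sketch only lists whatever keys are present.

    from pathlib import Path

    import numpy as np

    # Default cache layout reported by the command: <cache_dir>/molecules/<featurizer>/*.npz
    cache_subdir = Path("feature_cache") / "molecules" / "ecfp"

    npz_files = sorted(cache_subdir.glob("*.npz"))
    print(f"{len(npz_files)} cached feature files under {cache_subdir}")

    for npz_file in npz_files:
        with np.load(npz_file) as data:
            # The array names stored in each .npz are not specified by the CLI
            # output, so just list the keys found in each cached file.
            print(f"  {npz_file.name}: arrays={list(data.files)}")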