HFooladi
diff --git a/‎.github/dependabot.yml‎
Lines changed: 1 addition & 10 deletions b/‎.github/dependabot.yml‎
Lines changed: 1 addition & 10 deletions
diff --git a/‎.gitignore‎
Lines changed: 34 additions & 0 deletions b/‎.gitignore‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎.readthedocs.yaml‎
Lines changed: 13 additions & 16 deletions b/‎.readthedocs.yaml‎
Lines changed: 13 additions & 16 deletions
diff --git a/‎README.md‎
Lines changed: 72 additions & 94 deletions b/‎README.md‎
Lines changed: 72 additions & 94 deletions
diff --git a/‎configs/pipeline_example.yaml‎
Lines changed: 34 additions & 0 deletions b/‎configs/pipeline_example.yaml‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎datasets/sample_tasks_list.json‎
Lines changed: 1 addition & 1 deletion b/‎datasets/sample_tasks_list.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎datasets/test/test_proteins.fasta‎
Lines changed: 1 addition & 1 deletion b/‎datasets/test/test_proteins.fasta‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎datasets/train/train_proteins.fasta‎
Lines changed: 1 addition & 1 deletion b/‎datasets/train/train_proteins.fasta‎
Lines changed: 1 addition & 1 deletion
@@ -1,19 +1,10 @@
 version: 2
 updates:
-  - package-ecosystem: "pip"
-    directory: "/"
-    schedule:
-      interval: "weekly"
-    open-pull-requests-limit: 10
-    labels:
-      - "dependencies"
-      - "python"
-
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
       interval: "weekly"
-    open-pull-requests-limit: 10
+    open-pull-requests-limit: 5
     labels:
       - "dependencies"
       - "github_actions"
@@ -184,3 +184,37 @@ results/
 
 # ignore CLAUDE file
 CLAUDE.md
+
+# ignore .ruff_cache
+.ruff_cache/
+
+# ignore coverage.py data file (.coverage is a file, so no trailing slash)
+.coverage
+
+# ignore .pytest_cache
+.pytest_cache/
+
+# ignore .mypy_cache
+.mypy_cache/
+
+# ignore .ipynb_checkpoints
+.ipynb_checkpoints/
+
+# ignore .pytest_cache
+.pytest_cache/
+
+# ignore all the output folders
+output/
+output_cache/
+output_results/
+output_cache/
+test_output/
+cli_output/
+
+# ignore all the cache folders
+task_distance_cache/
+
+# ignore embeddings and generated cache files
+datasets/embeddings/
+datasets/protein_features_cache.pkl
+datasets/processing_summary.json
@@ -1,23 +1,20 @@
-# .readthedocs.yml
+# .readthedocs.yaml
 # Read the Docs configuration file
 # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 
-# Required
 version: 2
 
-# Optionally set the version of Python and requirements required to build your docs
-python:
-  version: "3.8"
-  install:
-      - method: pip
-        path: .
-        extra_requirements:
-          - rtd
-      - requirements: docs/requirements.txt
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.10"
 
+mkdocs:
+  configuration: mkdocs.yml
 
-# Build documentation in the docs/ directory with Sphinx
-sphinx:
-  builder: html
-  configuration: docs/source/conf.py
-  fail_on_warning: true
+python:
+  install:
+    - method: pip
+      path: .
+      extra_requirements:
+        - docs
@@ -19,7 +19,7 @@ A Python library for calculating distances between chemical datasets to enable i
 - [Installation](#installation)
 - [Quick Start](#quick-start)
 - [Usage Examples](#usage-examples)
-- [Use Cases](#use-cases)
+- [Reproducing FS-Mol Experiments](#reproducing-fs-mol-experiments)
 - [Documentation](#documentation)
 - [Contributing](#contributing)
 - [Citation](#citation)
@@ -92,109 +92,90 @@ pip install -e . --no-deps
 
 ## Quick Start
 
-### Basic Dataset Analysis
+### Compute Dataset Distances
+
+The simplest way to compute distances between molecular datasets:
 
 ```python
-import os
-from dpu_utils.utils.richpath import RichPath
-from themap.data.molecule_dataset import MoleculeDataset
-
-# Load datasets
-source_dataset_path = RichPath.create(os.path.join("datasets", "train", "CHEMBL1023359.jsonl.gz"))
-source_dataset = MoleculeDataset.load_from_file(source_dataset_path)
-
-# Basic dataset analysis (works with minimal installation)
-print(f"Dataset size: {len(source_dataset)}")
-print(f"Positive ratio: {source_dataset.get_ratio}")
-print(f"Dataset statistics: {source_dataset.get_statistics()}")
-
-# Validate dataset integrity
-try:
-    source_dataset.validate_dataset_integrity()
-    print("✅ Dataset is valid")
-except ValueError as e:
-    print(f"❌ Dataset validation failed: {e}")
-```
+from themap import quick_distance
 
-### Molecular Embeddings
+results = quick_distance(
+    data_dir="datasets",          # Directory with train/ and test/ folders
+    output_dir="output",          # Where to save results
+    molecule_featurizer="ecfp",   # Fingerprint type (ecfp, maccs, etc.)
+    molecule_method="euclidean",  # Distance metric
+)
 
-```python
-# Only works with pip install -e ".[ml]" or higher
-from themap.data.molecule_dataset import MoleculeDataset
-dataset_path = RichPath.create(os.path.join("datasets", "train", "CHEMBL1023359.jsonl.gz"))
-
-# Load dataset
-dataset = MoleculeDataset.load_from_file(dataset_path)
-
-# Calculate molecular embeddings (requires ML dependencies)
-try:
-    features = dataset.get_features("ecfp")
-    print(f"Features shape: {features.shape}")
-except ImportError:
-    print("❌ ML dependencies not installed. Use: pip install -e '.[ml]'")
+# Results saved to output/molecule_distances.csv
 ```
 
-### Distance Calculation
+### Using a Config File
+
+For reproducible experiments, use a YAML configuration:
 
 ```python
-# Only works with pip install -e ".[all]"
-from themap.data.tasks import Tasks, Task
-from themap.distance import MoleculeDatasetDistance, ProteinDatasetDistance, TaskDistance
-
-# Create Tasks collection from your datasets
-source_dataset_path = RichPath.create(os.path.join("datasets", "train", "CHEMBL1023359.jsonl.gz"))
-source_dataset = MoleculeDataset.load_from_file(source_dataset_path)
-target_dataset_path = RichPath.create(os.path.join("datasets", "test", "CHEMBL2219358.jsonl.gz"))
-target_dataset = MoleculeDataset.load_from_file(target_dataset_path)
-source_task = Task(task_id="CHEMBL1023359", molecule_dataset=source_dataset)
-target_task = Task(task_id="CHEMBL2219358", molecule_dataset=target_dataset)
-
-# Step 1: Create Tasks collection with train/test split
-tasks = Tasks(train_tasks=[source_task], test_tasks=[target_task])
-
-# Step 2: Compute molecule distance with method-specific configuration
-try:
-    # Use different methods for different data types
-    mol_dist = MoleculeDatasetDistance(
-        tasks=tasks,
-        molecule_method="otdd",     # OTDD for molecules
-    )
-    mol_dist._compute_features()
-    distance = mol_dist.get_distance()
-    print(distance)
-
-except ImportError:
-    print("❌ Distance calculation dependencies not installed. Use: pip install -e '.[all]'")
+from themap import run_pipeline
+
+results = run_pipeline("config.yaml")
 ```
 
+Example `config.yaml`:
+```yaml
+data:
+  directory: "datasets"
 
-## Usage Examples
+molecule:
+  enabled: true
+  featurizer: "ecfp"
+  method: "euclidean"
 
-### Transfer Learning Dataset Selection
-```python
-# Find the most similar training datasets for your target task
-candidate_datasets = ["CHEMBL1023359", "CHEMBL2219358", "CHEMBL1243967"]
-target_dataset = "my_target_assay"
+output:
+  directory: "output"
+  format: "csv"
+```
+
+### Data Format
+
+Organize your data in this structure:
 
-distances = calculate_all_distances(candidate_datasets, target_dataset)
-best_source = min(distances, key=distances.get)  # Closest dataset for transfer learning
+```
+datasets/
+├── train/                        # Source datasets
+│   ├── CHEMBL123456.jsonl.gz
+│   └── ...
+└── test/                         # Target datasets
+    ├── CHEMBL111111.jsonl.gz
+    └── ...
 ```
 
-### Domain Adaptation Assessment
-```python
-# Assess how much domain shift exists between datasets
-domain_gap = calculate_dataset_distance(source_domain, target_domain)
-if domain_gap < threshold:
-    print("Direct transfer likely to work well")
-else:
-    print("Domain adaptation strategies recommended")
+Each `.jsonl.gz` file contains molecules in JSON lines format:
+```json
+{"SMILES": "CCO", "Property": 1}
+{"SMILES": "CCCO", "Property": 0}
 ```
 
-### Task Hardness Prediction
+
+## Usage Examples
+
+### Analyzing Distance Results
+
 ```python
-# Predict task difficulty based on dataset characteristics
-hardness_score = estimate_task_hardness(dataset, reference_datasets)
-print(f"Predicted task difficulty: {hardness_score}")
+import pandas as pd
+
+# Load computed distances
+distances = pd.read_csv("output/molecule_distances.csv", index_col=0)
+
+# Find closest source for each target (transfer learning selection)
+for target in distances.columns:
+    closest = distances[target].idxmin()
+    dist = distances[target].min()
+    print(f"{target} <- {closest} (distance: {dist:.4f})")
+
+# Estimate task hardness (mean distance to the k nearest sources)
+k = 3
+for target in distances.columns:
+    hardness = distances[target].nsmallest(k).mean()
+    print(f"Task hardness for {target}: {hardness:.4f}")
 ```
 
 ## Reproducing FS-Mol Experiments
@@ -204,7 +185,7 @@ Pre-computed molecular embeddings and distance matrices for the FS-Mol dataset a
 ### Setup
 1. Download data from [Zenodo](https://zenodo.org/records/10605093)
 2. Extract to `datasets/fsmol_hardness/`
-3. Run the provided Jupyter notebooks in the `notebooks/` directory
+3. See the `examples/` directory for usage examples
 
 ## Documentation
 
@@ -261,11 +242,8 @@ If you use THEMAP in your research, please cite our paper:
 
 This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
 
-## 🤝 Support
-
- - 📖 [Documentation](https://hfooladi.github.io/THEMAP/)
- - 🐛 [Issue Tracker](https://github.com/HFooladi/THEMAP/issues)
- - 💬 [Discussions](https://github.com/HFooladi/THEMAP/discussions)
----
+## Support
 
-**Ready to optimize your chemical dataset selection for machine learning?** Start with THEMAP today! 🚀
+- [Documentation](https://hfooladi.github.io/THEMAP/)
+- [Issue Tracker](https://github.com/HFooladi/THEMAP/issues)
+- [Discussions](https://github.com/HFooladi/THEMAP/discussions)
@@ -0,0 +1,34 @@
+# THEMAP Pipeline Configuration Example
+# Run with: themap run configs/pipeline_example.yaml
+
+data:
+  directory: "datasets"      # Path to dataset directory
+  task_list: null            # Optional: task list JSON file (auto-discover if null)
+
+distances:
+  molecule:
+    enabled: true
+    featurizer: "ecfp"       # Options: ecfp, maccs, desc2D, mordred, ChemBERTa-77M-MLM, etc.
+    method: "euclidean"      # Options: euclidean, cosine, otdd
+
+  protein:
+    enabled: false           # Set to true if you have protein FASTA files
+    featurizer: "esm2_t33_650M_UR50D"
+    method: "cosine"         # Options: euclidean, cosine, manhattan
+    layer: null              # Auto-detect based on model
+
+combination:
+  strategy: "weighted_average"  # Options: average, weighted_average, separate
+  weights:
+    molecule: 0.7
+    protein: 0.3
+
+output:
+  directory: "output/"
+  format: "csv"              # Options: csv, json, npz
+  save_features: true        # Cache features for reuse
+
+compute:
+  n_jobs: 8                  # Parallel workers
+  batch_size: 1000           # Batch size for featurization
+  device: "auto"             # Options: auto, cpu, cuda
@@ -1 +1 @@
-{"train": ["CHEMBL894522", "CHEMBL1023359", "CHEMBL2218944", "CHEMBL2219012", "CHEMBL3371729", "CHEMBL3705844", "CHEMBL3866221", "CHEMBL4224224"], "valid": [], "test": ["CHEMBL2219236", "CHEMBL2219358"]}
+{"train": ["CHEMBL894522", "CHEMBL1023359", "CHEMBL2218944", "CHEMBL2219012", "CHEMBL3371729", "CHEMBL3705844", "CHEMBL3866221", "CHEMBL4224224"], "valid": [], "test": ["CHEMBL2219236", "CHEMBL2219358", "CHEMBL1963831"]}
@@ -3,4 +3,4 @@ MSLHFLYYCSEPTLDVKIAFCQGFDKQVDVSYIAKHYNMSKSKVDNQFYSVEVGDSTFTVLKRYQNLKPIGSGAQGIVCA
 >sp|Q13177|PAK2_HUMAN
 MSDNGELEDKPPAPPVRMSSTIFSTGGKDPLSANHSLKPLPSVPEEKKPRHKIISIFSGTEKGSKKKEKERPEISPPSDFEHTIHVGFDAVTGEFTGMPEQWARLLQTSNITKLEQKKNPQAVLDVLKFYDSNTVKQKYLSFTPPEKDGFPSGTPALNAKGTEAPAVVTEEEDDDEETAPPVIAPRPDHTKSIYTRSVIDPVPAPVGDSHVDGAAKSLDKQKKKTKMTDEEIMEKLRTIVSIGDPKKKYTRYEKIGQGASGTVFTATDVALGQEVAIKQINLQKQPKKELIINEILVMKELKNPNIVNFLDSYLVGDELFVVMEYLAGGSLTDVVTETCMDEAQIAAVCRECLQALEFLHANQVIHRDIKSDNVLLGMEGSVKLTDFGFCAQITPEQSKRSTMVGTPYWMAPEVVTRKAYGPKVDIWSLGIMAIEMVEGEPPYLNENPLRALYLIATNGTPELQNPEKLSPIFRDFLNRCLEMDVEKRGSAKELLQHPFLKLAKPLSSLTPLIMAAKEAMKSNR
 >sp|P50750|CDK9_HUMAN
-MAKQYDSVECPFCDEVSKYEKLAKIGQGTFGEVFKARHRKTGQKVALKKVLMENEKEGFPITALREIKILQLLKHENVVNLIEICRTKASPYNRCKGSIYLVFDFCEHDLAGLLSNVLVKFTLSEIKRVMQMLLNGLYYIHRNKILHRDMKAANVLITRDGVLKLADFGLARAFSLAKNSQPNRYTNRVVTLWYRPPELLLGERDYGPPIDLWGAGCIMAEMWTRSPIMQGNTEQHQLALISQLCGSITPEVWPNVDNYELYEKLELVKGQKRKVKDRLKAYVRDPYALDLIDKLLVLDPAQRIDSDDALNHDFFWSDPMPSDLKGMLSTHLTSMFEYLAPPRRKGSQITQQSTNQSRNPATTNQTEFERVF
+MAKQYDSVECPFCDEVSKYEKLAKIGQGTFGEVFKARHRKTGQKVALKKVLMENEKEGFPITALREIKILQLLKHENVVNLIEICRTKASPYNRCKGSIYLVFDFCEHDLAGLLSNVLVKFTLSEIKRVMQMLLNGLYYIHRNKILHRDMKAANVLITRDGVLKLADFGLARAFSLAKNSQPNRYTNRVVTLWYRPPELLLGERDYGPPIDLWGAGCIMAEMWTRSPIMQGNTEQHQLALISQLCGSITPEVWPNVDNYELYEKLELVKGQKRKVKDRLKAYVRDPYALDLIDKLLVLDPAQRIDSDDALNHDFFWSDPMPSDLKGMLSTHLTSMFEYLAPPRRKGSQITQQSTNQSRNPATTNQTEFERVF
@@ -17,4 +17,4 @@ MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNYPEGAAYEFNAAAAANAQVYGQTGLPY
 >SP|O75460|ERN1_HUMAN
 MPARRLLLLLTLLLPGLGIFGSTSTVTLPETLLFVSTLDGSLHAVSKRTGSIKWTLKEDPVLQVPTHVEEPAFLPDPNDGSLYTLGSKNNEGLTKLPFTIPELVQASPCRSSDGILYMGKKQDIWYVIDLLTGEKQQTLSSAFADSLCPSTSLLYLGRTEYTITMYDTKTRELRWNATYFDYAASLPEDDVDYKMSHFVSNGDGLVVTVDSESGDVLWIQNYASPVVAFYVWQREGLRKVMHINVAVETLRYLTFMSGEVGRITKWKYPFPKETEAKSKLTPTLYVGKYSTSLYASPSMVHEGVAVVPRGSTLPLLEGPQTDGVTIGDKGECVITPSTDVKFDPGLKSKNKLNYLRNYWLLIGHHETPLSASTKMLERFPNNLPKHRENVIPADSEKKSFEEVINLVDQTSENAPTTVSRDVEEKPAHAPARPEAPVDSMLKDMATIILSTFLLIGWVAFIITYPLSMHQQQQLQHQQFQKELEKIQLLQQQQQQLPFHPPGDTAQDGELLDTSGPYSESSGTSSPSTSPRASNHSLCSGSSASKAGSSPSLEQDDGDEETSVVIVGKISFCPKDVLGHGAEGTIVYRGMFDNRDVAVKRILPECFSFADREVQLLRESDEHPNVIRYFCTEKDRQFQYIAIELCAATLQEYVEQKDFAHLGLEPITLLQQTTSGLAHLHSLNIVHRDLKPHNILISMPNAHGKIKAMISDFGLCKKLAVGRHSFSRRSGVPGTEGWIAPEMLSEDCKENPTYTVDIFSAGCVFYYVISEGSHPFGKSLQRQANILLGACSLDCLHPEKHEDVIARELIEKMIAMDPQKRPSAKHVLKHPFFWSLEKQLQFFQDVSDRIEKESLDGPIVKQLERGGRAVVKMDWRENITVPLQTDLRKFRTYKGGSVRDLLRAMRNKKHHYRELPAEVRETLGSLPDDFVCYFTSRFPHLLAHTYRAMELCSHERLFQPYYFHEPPEPQPPVTPDAL
 >SP|Q16581|C3AR_HUMAN
-MASFSAETNSTDLLSQPWNEPPVILSMVILSLTFLLGLPGNGLVLWVAGLKMQRTVNTIWFLHLTLADLLCCLSLPFSLAHLALQGQWPYGRFLCKLIPSIIVLNMFASVFLLTAISLDRCLVVFKPIWCQNHRNVGMACSICGCIWVVAFVMCIPVFVYREIFTTDNHNRCGYKFGLSSSLDYPDFYGDPLENRSLENIVQPPGEMNDRLDPSSFQTNDHPWTVPTVFQPQTFQRPSADSLPRGSARLTSQNLYSNVFKPADVVSPKIPSGFPIEDHETSPLDNSDAFLSTHLKLFPSASSNSFYESELPQGFQDYYNLGQFTDDDQVPTPLVAITITRLVVGFLLPSVIMIACYSFIVFRMQRGRFAKSQSKTFRVAVVVVAVFLVCWTPYHIFGVLSLLTDPETPLGKTLMSWDHVCIALASANSCFNPFLYALLGKDFRKKARQSIQGILEAAFSEELTRSTHCPSNNVISERNSTTV
+MASFSAETNSTDLLSQPWNEPPVILSMVILSLTFLLGLPGNGLVLWVAGLKMQRTVNTIWFLHLTLADLLCCLSLPFSLAHLALQGQWPYGRFLCKLIPSIIVLNMFASVFLLTAISLDRCLVVFKPIWCQNHRNVGMACSICGCIWVVAFVMCIPVFVYREIFTTDNHNRCGYKFGLSSSLDYPDFYGDPLENRSLENIVQPPGEMNDRLDPSSFQTNDHPWTVPTVFQPQTFQRPSADSLPRGSARLTSQNLYSNVFKPADVVSPKIPSGFPIEDHETSPLDNSDAFLSTHLKLFPSASSNSFYESELPQGFQDYYNLGQFTDDDQVPTPLVAITITRLVVGFLLPSVIMIACYSFIVFRMQRGRFAKSQSKTFRVAVVVVAVFLVCWTPYHIFGVLSLLTDPETPLGKTLMSWDHVCIALASANSCFNPFLYALLGKDFRKKARQSIQGILEAAFSEELTRSTHCPSNNVISERNSTTV
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-{"train": ["CHEMBL894522", "CHEMBL1023359", "CHEMBL2218944", "CHEMBL2219012", "CHEMBL3371729", "CHEMBL3705844", "CHEMBL3866221", "CHEMBL4224224"], "valid": [], "test": ["CHEMBL2219236", "CHEMBL2219358"]}`
	`1`	`+{"train": ["CHEMBL894522", "CHEMBL1023359", "CHEMBL2218944", "CHEMBL2219012", "CHEMBL3371729", "CHEMBL3705844", "CHEMBL3866221", "CHEMBL4224224"], "valid": [], "test": ["CHEMBL2219236", "CHEMBL2219358", "CHEMBL1963831"]}`