@@ -19,7 +19,7 @@ A Python library for calculating distances between chemical datasets to enable i
1919- [ Installation] ( #installation )
2020- [ Quick Start] ( #quick-start )
2121- [ Usage Examples] ( #usage-examples )
22- - [ Use Cases ] ( #use-cases )
22+ - [ Reproducing FS-Mol Experiments ] ( #reproducing-fs-mol-experiments )
2323- [ Documentation] ( #documentation )
2424- [ Contributing] ( #contributing )
2525- [ Citation] ( #citation )
@@ -92,109 +92,90 @@ pip install -e . --no-deps
9292
9393## Quick Start
9494
95- ### Basic Dataset Analysis
95+ ### Compute Dataset Distances
96+
97+ The simplest way to compute distances between molecular datasets:
9698
9799``` python
98- import os
99- from dpu_utils.utils.richpath import RichPath
100- from themap.data.molecule_dataset import MoleculeDataset
101-
102- # Load datasets
103- source_dataset_path = RichPath.create(os.path.join(" datasets" , " train" , " CHEMBL1023359.jsonl.gz" ))
104- source_dataset = MoleculeDataset.load_from_file(source_dataset_path)
105-
106- # Basic dataset analysis (works with minimal installation)
107- print (f " Dataset size: { len (source_dataset)} " )
108- print (f " Positive ratio: { source_dataset.get_ratio} " )
109- print (f " Dataset statistics: { source_dataset.get_statistics()} " )
110-
111- # Validate dataset integrity
112- try :
113- source_dataset.validate_dataset_integrity()
114- print (" ✅ Dataset is valid" )
115- except ValueError as e:
116- print (f " ❌ Dataset validation failed: { e} " )
117- ```
100+ from themap import quick_distance
118101
119- ### Molecular Embeddings
102+ results = quick_distance(
103+ data_dir = " datasets" , # Directory with train/ and test/ folders
104+ output_dir = " output" , # Where to save results
105+ molecule_featurizer = " ecfp" , # Fingerprint type (ecfp, maccs, etc.)
106+ molecule_method = " euclidean" , # Distance metric
107+ )
120108
121- ``` python
122- # Only works with pip install -e ".[ml]" or higher
123- from themap.data.molecule_dataset import MoleculeDataset
124- dataset_path = RichPath.create(os.path.join(" datasets" , " train" , " CHEMBL1023359.jsonl.gz" ))
125-
126- # Load dataset
127- dataset = MoleculeDataset.load_from_file(dataset_path)
128-
129- # Calculate molecular embeddings (requires ML dependencies)
130- try :
131- features = dataset.get_features(" ecfp" )
132- print (f " Features shape: { features.shape} " )
133- except ImportError :
134- print (" ❌ ML dependencies not installed. Use: pip install -e '.[ml]'" )
109+ # Results saved to output/molecule_distances.csv
135110```
136111
137- ### Distance Calculation
112+ ### Using a Config File
113+
114+ For reproducible experiments, use a YAML configuration:
138115
139116``` python
140- # Only works with pip install -e ".[all]"
141- from themap.data.tasks import Tasks, Task
142- from themap.distance import MoleculeDatasetDistance, ProteinDatasetDistance, TaskDistance
143-
144- # Create Tasks collection from your datasets
145- source_dataset_path = RichPath.create(os.path.join(" datasets" , " train" , " CHEMBL1023359.jsonl.gz" ))
146- source_dataset = MoleculeDataset.load_from_file(source_dataset_path)
147- target_dataset_path = RichPath.create(os.path.join(" datasets" , " test" , " CHEMBL2219358.jsonl.gz" ))
148- target_dataset = MoleculeDataset.load_from_file(target_dataset_path)
149- source_task = Task(task_id = " CHEMBL1023359" , molecule_dataset = source_dataset)
150- target_task = Task(task_id = " CHEMBL2219358" , molecule_dataset = target_dataset)
151-
152- # Step 1: Create Tasks collection with train/test split
153- tasks = Tasks(train_tasks = [source_task], test_tasks = [target_task])
154-
155- # Step 2: Compute molecule distance with method-specific configuration
156- try :
157- # Use different methods for different data types
158- mol_dist = MoleculeDatasetDistance(
159- tasks = tasks,
160- molecule_method = " otdd" , # OTDD for molecules
161- )
162- mol_dist._compute_features()
163- distance = mol_dist.get_distance()
164- print (distance)
165-
166- except ImportError :
167- print (" ❌ Distance calculation dependencies not installed. Use: pip install -e '.[all]'" )
117+ from themap import run_pipeline
118+
119+ results = run_pipeline(" config.yaml" )
168120```
169121
122+ Example ` config.yaml ` :
123+ ``` yaml
124+ data :
125+ directory : " datasets"
170126
171- ## Usage Examples
127+ molecule :
128+ enabled : true
129+ featurizer : " ecfp"
130+ method : " euclidean"
172131
173- ### Transfer Learning Dataset Selection
174- ``` python
175- # Find the most similar training datasets for your target task
176- candidate_datasets = [" CHEMBL1023359" , " CHEMBL2219358" , " CHEMBL1243967" ]
177- target_dataset = " my_target_assay"
132+ output :
133+ directory : " output"
134+ format : " csv"
135+ ` ` `
136+
137+ ### Data Format
138+
139+ Organize your data in this structure:
178140
179- distances = calculate_all_distances(candidate_datasets, target_dataset)
180- best_source = min (distances, key = distances.get) # Closest dataset for transfer learning
141+ ` ` `
142+ datasets/
143+ ├── train/ # Source datasets
144+ │ ├── CHEMBL123456.jsonl.gz
145+ │ └── ...
146+ └── test/ # Target datasets
147+ ├── CHEMBL111111.jsonl.gz
148+ └── ...
181149```
182150
183- ### Domain Adaptation Assessment
184- ``` python
185- # Assess how much domain shift exists between datasets
186- domain_gap = calculate_dataset_distance(source_domain, target_domain)
187- if domain_gap < threshold:
188- print (" Direct transfer likely to work well" )
189- else :
190- print (" Domain adaptation strategies recommended" )
151+ Each `.jsonl.gz` file contains one molecule record per line, in JSON Lines format:
152+ ``` json
153+ {"SMILES" : " CCO" , "Property" : 1 }
154+ {"SMILES" : " CCCO" , "Property" : 0 }
191155```
192156
193- ### Task Hardness Prediction
157+
158+ ## Usage Examples
159+
160+ ### Analyzing Distance Results
161+
194162``` python
195- # Predict task difficulty based on dataset characteristics
196- hardness_score = estimate_task_hardness(dataset, reference_datasets)
197- print (f " Predicted task difficulty: { hardness_score} " )
163+ import pandas as pd
164+
165+ # Load computed distances
166+ distances = pd.read_csv(" output/molecule_distances.csv" , index_col = 0 )
167+
168+ # Find closest source for each target (transfer learning selection)
169+ for target in distances.columns:
170+ closest = distances[target].idxmin()
171+ dist = distances[target].min()
172+ print (f " { target} <- { closest} (distance: { dist:.4f } ) " )
173+
174+ # Estimate task hardness (average distance to k-nearest sources)
175+ k = 3
176+ for target in distances.columns:
177+ hardness = distances[target].nsmallest(k).mean()
178+ print (f " Task hardness for { target} : { hardness:.4f } " )
198179```
199180
200181## Reproducing FS-Mol Experiments
@@ -204,7 +185,7 @@ Pre-computed molecular embeddings and distance matrices for the FS-Mol dataset a
204185### Setup
2051861 . Download data from [ Zenodo] ( https://zenodo.org/records/10605093 )
2061872 . Extract to ` datasets/fsmol_hardness/ `
207- 3 . Run the provided Jupyter notebooks in the ` notebooks /` directory
188+ 3 . See the `examples/` directory for usage examples
208189
209190## Documentation
210191
@@ -261,11 +242,8 @@ If you use THEMAP in your research, please cite our paper:
261242
262243This project is licensed under the MIT License - see the [ LICENSE] ( LICENSE ) file for details.
263244
264- ## 🤝 Support
265-
266- - 📖 [ Documentation] ( https://hfooladi.github.io/THEMAP/ )
267- - 🐛 [ Issue Tracker] ( https://github.com/HFooladi/THEMAP/issues )
268- - 💬 [ Discussions] ( https://github.com/HFooladi/THEMAP/discussions )
269- ---
245+ ## Support
270246
271- ** Ready to optimize your chemical dataset selection for machine learning?** Start with THEMAP today! 🚀
247+ - [ Documentation] ( https://hfooladi.github.io/THEMAP/ )
248+ - [ Issue Tracker] ( https://github.com/HFooladi/THEMAP/issues )
249+ - [ Discussions] ( https://github.com/HFooladi/THEMAP/discussions )
0 commit comments