HFooladi
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 4 additions & 5 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 18 additions & 22 deletions
diff --git a/tests/conftest.py b/tests/conftest.py
Lines changed: 11 additions & 6 deletions
diff --git a/tests/data/test_molecule_datapoint.py b/tests/data/test_molecule_datapoint.py
Lines changed: 8 additions & 11 deletions
diff --git a/tests/data/test_molecule_dataset.py b/tests/data/test_molecule_dataset.py
Lines changed: 16 additions & 18 deletions
@@ -1,10 +1,10 @@
 repos:
   # Ruff - replaces flake8, isort, black, and more
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.4  # Use latest stable version
+    rev: v0.12.4
     hooks:
       - id: ruff
-        args: [--fix]  # Automatically fix what can be fixed
+        args: [--fix]  # Automatically apply fixes for fixable lint violations
       - id: ruff-format
 
   # Type checking
@@ -13,8 +13,7 @@ repos:
     hooks:
       - id: mypy
         additional_dependencies: [types-requests]
-        args: [--ignore-missing-imports]
-        exclude: ^(scripts/|third_party/|themap/models/otdd/)
+        # No explicit args: mypy picks up its settings from [tool.mypy] in pyproject.toml
 
   # Basic file checks
   - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -27,7 +26,7 @@ repos:
       - id: check-merge-conflict
       - id: check-case-conflict
       - id: check-added-large-files
-        args: ['--maxkb=1000']  # Prevent large files
+        args: ['--maxkb=1000']
 
   # Optional: Run tests (can be slow, consider making it optional)
   # Uncomment if you want tests to run on every commit
 
@@ -19,7 +19,7 @@ dependencies = [
     "numpy",
     "pandas",
     "matplotlib",
-    "seaborn", 
+    "seaborn",
     "scikit-learn",
     "dpu-utils>=0.2.13",
     "rdkit",
@@ -60,7 +60,7 @@ protein = [
   "esm",
 ]
 
-# Optimal transport distances  
+# Optimal transport distances
 otdd = [
   "torch==2.4.0",
   "torchvision==0.19.0",
@@ -78,7 +78,7 @@ all = [
   "torchvision==0.19.0",
   "torchaudio==2.4.0",
   "molfeat==0.11.0",
-  "dgl<=2.0", 
+  "dgl<=2.0",
   "dgllife>=0.3.2",
   "pytorch_geometric",
   "fcd_torch",
@@ -152,15 +152,23 @@ exclude = [
 ]
 
 [tool.ruff]
-# Select rule codes to enforce. "E" and "F" are defaults. Add "I" for import sorting.
-# You can add more codes like "B" (flake8-bugbear) or "C4" (flake8-comprehensions) etc.
-lint.select = ["E", "F", "W", "I"] # W = warnings, I = isort
-lint.ignore = [
-    "E501", # Line length handled by formatter
-]
+# Enable auto-fixing
+fix = true
+
+# Set line length
 line-length = 110
+
+# Set target version
 target-version = "py310"
 
+# Select rule codes to enforce. "E" and "F" are defaults. Add "I" for import sorting.
+# You can add more codes like "B" (flake8-bugbear) or "C4" (flake8-comprehensions) etc.
+[tool.ruff.lint]
+
+select = ["E", "F", "W", "I"] # W = warnings, I = isort
+ignore = ["E501"] # Line length handled by formatter
+
+
 [tool.ruff.format]
 # Use double quotes for strings
 quote-style = "double"
@@ -199,19 +207,6 @@ output = "coverage.xml"
 
 [tool.mypy]
 python_version = "3.10"
-warn_return_any = true
-warn_unused_configs = true
-disallow_untyped_defs = true
-disallow_incomplete_defs = true
-check_untyped_defs = true
-disallow_untyped_decorators = true
-no_implicit_optional = true
-warn_redundant_casts = true
-warn_unused_ignores = true
-warn_no_return = true
-warn_unreachable = true
-strict_equality = true
-show_error_codes = true
 
 [[tool.mypy.overrides]]
 module = [
@@ -220,5 +215,6 @@ module = [
     "molfeat.*",
     "dpu_utils.*",
     "themap.models.otdd.*",
+    "tests.*",
 ]
 ignore_missing_imports = true
@@ -1,9 +1,10 @@
-import numpy as np
 import pandas as pd
 import pytest
-
 from dpu_utils.utils import RichPath
-from themap.data import MoleculeDatapoint, ProteinDataset, MoleculeDataset
+
+from themap.data.molecule_datapoint import MoleculeDatapoint
+from themap.data.protein_datasets import ProteinDataset
+
 
 @pytest.fixture
 def manual_smiles():
@@ -44,25 +45,29 @@ def dataset_CHEMBL2219236():
 def dataset_CHEMBL1963831():
     return RichPath.create("datasets/test/CHEMBL1963831.jsonl.gz")
 
+
 @pytest.fixture
 def dataset_CHEMBL1023359():
     return RichPath.create("datasets/test/CHEMBL1023359.jsonl.gz")
 
+
 @pytest.fixture
 def dataset_CHEMBL2219358():
     return RichPath.create("datasets/test/CHEMBL2219358.jsonl.gz")
 
+
 @pytest.fixture
 def dataset_CHEMBL1963831_csv():
     return pd.read_csv("tests/conftest/CHEMBL1963831.csv")
 
+
 @pytest.fixture
 def manual_protein_dataset():
     return ProteinDataset(
         task_id=["CHEMBL2219236", "CHEMBL2219358"],
-        protein={"Q13177" : "MSDNGELEDKPPAPPVRMSSTI",
-                 "P50750" : "MAKQYDSVECPFCDEVSKYEK"}
-        )
+        protein={"Q13177": "MSDNGELEDKPPAPPVRMSSTI", "P50750": "MAKQYDSVECPFCDEVSKYEK"},
+    )
+
 
 @pytest.fixture
 def protein_dataset_train():
 
@@ -2,6 +2,7 @@
 
 from themap.data.molecule_datapoint import MoleculeDatapoint
 
+
 def test_MoleculeDatapoint(datapoint_molecule):
     """Test the MoleculeDatapoint class functionality."""
     # Test the __repr__ method
@@ -19,15 +20,11 @@ def test_MoleculeDatapoint(datapoint_molecule):
     # Test the molecular_weight method
     assert round(datapoint_molecule.molecular_weight) == 78
 
+
 def test_MoleculeDatapoint_validation():
     """Test input validation in MoleculeDatapoint."""
     # Test valid initialization
-    datapoint = MoleculeDatapoint(
-        task_id="test_task",
-        smiles="c1ccccc1",
-        bool_label=True,
-        numeric_label=1.0
-    )
+    datapoint = MoleculeDatapoint(task_id="test_task", smiles="c1ccccc1", bool_label=True, numeric_label=1.0)
     assert datapoint.task_id == "test_task"
     assert datapoint.smiles == "c1ccccc1"
     assert datapoint.bool_label is True
@@ -38,23 +35,23 @@ def test_MoleculeDatapoint_validation():
         MoleculeDatapoint(
             task_id=123,  # Should be string
             smiles="c1ccccc1",
-            bool_label=True
+            bool_label=True,
         )
 
     # Test invalid smiles
     with pytest.raises(TypeError):
         MoleculeDatapoint(
             task_id="test_task",
             smiles=123,  # Should be string
-            bool_label=True
+            bool_label=True,
         )
 
     # Test invalid bool_label
     with pytest.raises(TypeError):
         MoleculeDatapoint(
             task_id="test_task",
             smiles="c1ccccc1",
-            bool_label=1  # Should be bool
+            bool_label=1,  # Should be bool
         )
 
     # Test invalid numeric_label
@@ -63,5 +60,5 @@ def test_MoleculeDatapoint_validation():
             task_id="test_task",
             smiles="c1ccccc1",
             bool_label=True,
-            numeric_label="invalid"  # Should be number or None
-        ) 
+            numeric_label="invalid",  # Should be number or None
+        )
@@ -1,8 +1,9 @@
-import pytest
 import numpy as np
+import pytest
 
-from themap.data.molecule_dataset import MoleculeDataset
 from themap.data.molecule_datapoint import MoleculeDatapoint
+from themap.data.molecule_dataset import MoleculeDataset
+
 
 def test_MoleculeDataset_load_from_file(dataset_CHEMBL2219236):
     """Test loading MoleculeDataset from file."""
@@ -18,18 +19,12 @@ def test_MoleculeDataset_load_from_file(dataset_CHEMBL2219236):
     # Test the __repr__ method
     assert str(dataset) == "MoleculeDataset(task_id=CHEMBL2219236, task_size=157)"
 
+
 def test_MoleculeDataset_validation():
     """Test input validation in MoleculeDataset."""
     # Test valid initialization
     dataset = MoleculeDataset(
-        task_id="test_task",
-        data=[
-            MoleculeDatapoint(
-                task_id="test_task",
-                smiles="c1ccccc1",
-                bool_label=True
-            )
-        ]
+        task_id="test_task", data=[MoleculeDatapoint(task_id="test_task", smiles="c1ccccc1", bool_label=True)]
     )
     assert dataset.task_id == "test_task"
     assert len(dataset) == 1
@@ -38,30 +33,31 @@ def test_MoleculeDataset_validation():
     with pytest.raises(TypeError):
         MoleculeDataset(
             task_id=123,  # Should be string
-            data=[]
+            data=[],
         )
 
     # Test invalid data
     with pytest.raises(TypeError):
         MoleculeDataset(
             task_id="test_task",
-            data="not_a_list"  # Should be list
+            data="not_a_list",  # Should be list
         )
 
     # Test invalid data items
     with pytest.raises(TypeError):
         MoleculeDataset(
             task_id="test_task",
-            data=["not_a_MoleculeDatapoint"]  # Should be MoleculeDatapoint
+            data=["not_a_MoleculeDatapoint"],  # Should be MoleculeDatapoint
         )
 
+
 def test_MoleculeDataset_properties():
     """Test MoleculeDataset properties."""
     # Create a test dataset
     datapoints = [
         MoleculeDatapoint("test_task", "c1ccccc1", True),
         MoleculeDatapoint("test_task", "c1ccccc1", False),
-        MoleculeDatapoint("test_task", "c1ccccc1", True)
+        MoleculeDatapoint("test_task", "c1ccccc1", True),
     ]
     dataset = MoleculeDataset("test_task", datapoints)
 
@@ -83,13 +79,14 @@ def test_MoleculeDataset_properties():
     # Test get_ratio property
     assert dataset.get_ratio == 0.67  # 2/3 rounded to 2 decimal places
 
+
 def test_MoleculeDataset_filter():
     """Test MoleculeDataset filtering."""
     # Create a test dataset
     datapoints = [
         MoleculeDatapoint("test_task", "c1ccccc1", True),
         MoleculeDatapoint("test_task", "c1ccccc1", False),
-        MoleculeDatapoint("test_task", "c1ccccc1", True)
+        MoleculeDatapoint("test_task", "c1ccccc1", True),
     ]
     dataset = MoleculeDataset("test_task", datapoints)
 
@@ -98,22 +95,23 @@ def test_MoleculeDataset_filter():
     assert len(filtered_dataset) == 2
     assert all(dp.bool_label for dp in filtered_dataset)
 
+
 def test_MoleculeDataset_statistics():
     """Test MoleculeDataset statistics."""
     # Create a test dataset
     datapoints = [
         MoleculeDatapoint("test_task", "c1ccccc1", True),
         MoleculeDatapoint("test_task", "c1ccccc1", False),
-        MoleculeDatapoint("test_task", "c1ccccc1", True)
+        MoleculeDatapoint("test_task", "c1ccccc1", True),
     ]
     dataset = MoleculeDataset("test_task", datapoints)
 
     # Get statistics
     stats = dataset.get_statistics()
-    
+
     # Check statistics
     assert stats["size"] == 3
     assert stats["positive_ratio"] == 0.67
     assert isinstance(stats["avg_molecular_weight"], float)
     assert isinstance(stats["avg_atoms"], float)
-    assert isinstance(stats["avg_bonds"], float) 
+    assert isinstance(stats["avg_bonds"], float)