# ChEBai

ChEBai is a deep learning library designed for the integration of deep learning methods with chemical ontologies, particularly ChEBI.
The library emphasizes the incorporation of the semantic qualities of the ontology into the learning process.

## Note for developers

If you have used ChEBai before PR #39, the file structure in which your ChEBI data is saved has changed. This means that
datasets will be freshly generated. The data itself, however, is the same. If you want to keep the old data (including the old
splits), you can use a migration script. It copies the old data to the new location for a specific ChEBI class
(including the ChEBI version and other parameters). The script can be called by specifying the data module from a config:
```
python chebai/preprocessing/migration/chebi_data_migration.py migrate --datamodule=[path-to-data-config]
```
or by specifying the class name (e.g. `ChEBIOver50`) and arguments separately:
```
python chebai/preprocessing/migration/chebi_data_migration.py migrate --class_name=[data-class] [--chebi_version=[version]]
```
The new dataset will by default generate random data splits (with a given seed).
To reuse a fixed data split, you have to provide the path of the CSV file generated during the migration:
`--data.init_args.splits_file_path=[path-to-processed_data]/splits.csv`

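For example, a hypothetical migration of the `ChEBIOver50` data might look like this (the ChEBI version is an assumed example, and the processed-data path depends on your setup):

```
# Migrate the old ChEBIOver50 data for a specific (assumed) ChEBI version
python chebai/preprocessing/migration/chebi_data_migration.py migrate --class_name=ChEBIOver50 --chebi_version=231

# Later runs can then reuse the migrated split via the generated CSV:
# --data.init_args.splits_file_path=[path-to-processed_data]/splits.csv
```
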
## Installation

To install ChEBai, follow these steps:
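A minimal sketch of the clone-and-install flow (the repository URL matches the project links below):

```
# Clone the repository and install the package into the active environment
git clone https://github.com/ChEB-AI/python-chebai.git
cd python-chebai
pip install .
```
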
## Usage

Training and inference are abstracted using PyTorch Lightning modules.
Here are some CLI commands for the standard functionalities of pretraining, ontology extension, fine-tuning for toxicity and prediction.
For further details, see the [wiki](https://github.com/ChEB-AI/python-chebai/wiki).
If you face any problems, please open a new [issue](https://github.com/ChEB-AI/python-chebai/issues/new).
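
As a rough, non-authoritative sketch of the command shape (the config paths are placeholders; the wiki has the canonical commands), a PyTorch-Lightning-CLI-style training call looks like:

```
python -m chebai fit --trainer=[path-to-trainer-config] --model=[path-to-model-config] --data=[path-to-data-config]
```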

The `classes_path` is the path to the dataset's `raw/classes.txt` file that contains the relevant classes.

## Evaluation

An example for evaluating a model trained on the ontology extension task is given in `tutorials/eval_model_basic.ipynb`.
It takes the fine-tuned model as input for performing the evaluation.

## Cross-validation

You can do inner k-fold cross-validation, i.e., train models on k train-validation splits that all use the same test
set. For that, you need to specify the total number of folds as
```
--data.init_args.inner_k_folds=K
```
and the fold to be used in the current optimisation run as
```
--data.init_args.fold_index=I
```
To train K models, you need to do K such calls, each with a different `fold_index`. On the first call with a given
`inner_k_folds`, all folds will be created and stored in the data directory.
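
For example, a minimal sketch that trains K=3 models in sequence (the `fit` call shape is the assumed one sketched under Usage):

```
# Each call trains one model on fold $I; the first call also creates
# and stores all folds in the data directory
for I in 0 1 2; do
  python -m chebai fit --data=[path-to-data-config] \
    --data.init_args.inner_k_folds=3 \
    --data.init_args.fold_index=$I
done
```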