
Commit ebc450f

Merge branch 'refs/heads/dev' into feature/api_downloadble_models
# Conflicts:
#   chebifier/cli.py
#   chebifier/ensemble/base_ensemble.py

2 parents e0b3ca7 + 2c724e7


8 files changed: +723 / -155 lines changed

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+# This workflow will upload a Python Package to PyPI when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  release-build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Build release distributions
+        run: |
+          python -m pip install build
+          python -m build
+
+      - name: Upload distributions
+        uses: actions/upload-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+  pypi-publish:
+    runs-on: ubuntu-latest
+    needs:
+      - release-build
+    permissions:
+      # IMPORTANT: this permission is mandatory for trusted publishing
+      id-token: write
+
+    # Dedicated environments with protections for publishing are strongly recommended.
+    # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
+    environment:
+      name: pypi
+      # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
+      # url: https://pypi.org/p/YOURPROJECT
+      #
+      # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
+      # ALTERNATIVE: exactly, uncomment the following line instead:
+      # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
+
+    steps:
+      - name: Retrieve release distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+      - name: Publish release distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          packages-dir: dist/

README.md

Lines changed: 61 additions & 30 deletions
@@ -1,5 +1,5 @@
 # python-chebifier
-An AI ensemble model for predicting chemical classes.
+An AI ensemble model for predicting chemical classes in the ChEBI ontology.
 
 ## Installation
 
@@ -12,6 +12,9 @@ cd python-chebifier
 pip install -e .
 ```
 
+Some dependencies of `chebai-graph` cannot be installed automatically. If you want to use Graph Neural Networks, follow
+the instructions in the [chebai-graph repository](https://github.com/ChEB-AI/python-chebai-graph).
+
 ## Usage
 
 ### Command Line Interface
@@ -23,39 +26,18 @@ The package provides a command-line interface (CLI) for making predictions using
 python -m chebifier.cli --help
 
 # Make predictions using a configuration file
-python -m chebifier.cli predict example_config.yml --smiles "CC(=O)OC1=CC=CC=C1C(=O)O" "C1=CC=C(C=C1)C(=O)O"
+python -m chebifier.cli predict configs/example_config.yml --smiles "CC(=O)OC1=CC=CC=C1C(=O)O" "C1=CC=C(C=C1)C(=O)O"
 
 # Make predictions using SMILES from a file
-python -m chebifier.cli predict example_config.yml --smiles-file smiles.txt
+python -m chebifier.cli predict configs/example_config.yml --smiles-file smiles.txt
 ```
 
 ### Configuration File
 
-The CLI requires a YAML configuration file that defines the ensemble model. Here's an example:
-
-```yaml
-# Example configuration file for Chebifier ensemble model
-
-# Each key in the top-level dictionary is a model name
-model1:
-  # Required: type of model (must be one of the keys in MODEL_TYPES)
-  type: electra
-  # Required: name of the model
-  model_name: electra_model1
-  # Required: path to the checkpoint file
-  ckpt_path: /path/to/checkpoint1.ckpt
-  # Required: path to the target labels file
-  target_labels_path: /path/to/target_labels1.txt
-  # Optional: batch size for predictions (default is likely defined in the model)
-  batch_size: 32
-
-model2:
-  type: electra
-  model_name: electra_model2
-  ckpt_path: /path/to/checkpoint2.ckpt
-  target_labels_path: /path/to/target_labels2.txt
-  batch_size: 64
-```
+The CLI requires a YAML configuration file that defines the ensemble model. An example can be found in `configs/example_config.yml`.
+
+The models and other required files are trained / generated by our [chebai](https://github.com/ChEB-AI/python-chebai) package.
+Examples for models can be found on [kaggle](https://www.kaggle.com/datasets/sfluegel/chebai).
 
 ### Python API
 
@@ -77,10 +59,59 @@ smiles_list = ["CC(=O)OC1=CC=CC=C1C(=O)O", "C1=CC=C(C=C1)C(=O)O"]
 predictions = ensemble.predict_smiles_list(smiles_list)
 
 # Print results
-for smile, prediction in zip(smiles_list, predictions):
-    print(f"SMILES: {smile}")
+for smiles, prediction in zip(smiles_list, predictions):
+    print(f"SMILES: {smiles}")
     if prediction:
         print(f"Predicted classes: {prediction}")
     else:
         print("No predictions")
 ```
+
+### The ensemble
+
+Given a sample (i.e., a SMILES string) and models $m_1, m_2, \ldots, m_n$, the ensemble works as follows:
+1. Get predictions from each model $m_i$ for the sample.
+2. For each class $c$, aggregate predictions $p_c^{m_i}$ from all models that made a prediction for that class.
+The aggregation happens separately for all positive predictions (i.e., $p_c^{m_i} \geq 0.5$) and all negative predictions
+($p_c^{m_i} < 0.5$). If the aggregated value is larger for the positive predictions than for the negative predictions,
+the ensemble makes a positive prediction for class $c$:
+
+$$
+\text{ensemble}(c) = \begin{cases}
+1 & \text{if } \sum_{i: p_c^{m_i} \geq 0.5} [\text{confidence}_c^{m_i} \cdot \text{model_weight}_{m_i} \cdot \text{trust}_c^{m_i}] > \sum_{i: p_c^{m_i} < 0.5} [\text{confidence}_c^{m_i} \cdot \text{model_weight}_{m_i} \cdot \text{trust}_c^{m_i}] \\
+0 & \text{otherwise}
+\end{cases}
+$$
+
+Here, confidence is the model's (self-reported) confidence in its prediction, calculated as
+$$
+\text{confidence}_c^{m_i} = 2|p_c^{m_i} - 0.5|
+$$
+For example, if a model makes a positive prediction with $p_c^{m_i} = 0.55$, the confidence is $2|0.55 - 0.5| = 0.1$.
+One could say that the model is not very confident in its prediction and very close to switching to a negative prediction.
+If another model is very sure about its negative prediction with $p_c^{m_j} = 0.1$, the confidence is $2|0.1 - 0.5| = 0.8$.
+Therefore, if in doubt, we are more confident in the negative prediction.
+
+Confidence can be disabled by the `use_confidence` parameter of the predict method (default: True).
+
+The model_weight can be set for each model in the configuration file (default: 1). This is used to favor a certain
+model independently of a given class.
+Trust is based on the model's performance on a validation set. After training, we evaluate the Machine Learning models
+on a validation set for each class. If the `ensemble_type` is set to `wmv-f1`, the trust is calculated as 1 + the F1 score.
+If the `ensemble_type` is set to `mv` (the default), the trust is set to 1 for all models.
+
+3. After a decision has been made for each class independently, the consistency of the predictions with regard to the ChEBI hierarchy
+and disjointness axioms is checked. This is
+done in 3 steps:
+- (1) First, the hierarchy is corrected. For each pair of classes $A$ and $B$ where $A$ is a subclass of $B$ (following
+the is-a relation in ChEBI), we set the ensemble prediction of $B$ to 1 if the prediction of $A$ is 1. Intuitively
+speaking, if we have determined that a molecule belongs to a specific class (e.g., aromatic primary alcohol), it also
+belongs to the direct and indirect superclasses (e.g., primary alcohol, aromatic alcohol, alcohol).
+- (2) Next, we check for disjointness. This is not specified directly in ChEBI, but in an additional ChEBI module ([chebi-disjoints.owl](https://ftp.ebi.ac.uk/pub/databases/chebi/ontology/)).
+We have extracted these disjointness axioms into a CSV file and added some more disjointness axioms ourselves (see
+`data>disjoint_chebi.csv` and `data>disjoint_additional.csv`). If two classes $A$ and $B$ are disjoint and we predict
+both, we select one of them randomly and set the other to 0.
+- (3) Since the second step might have introduced new inconsistencies into the hierarchy, we repeat the first step, but
+with a small change. For a pair of classes $A \subseteq B$ with predictions $1$ and $0$, instead of setting $B$ to $1$,
+we now set $A$ to $0$. This has the advantage that we cannot introduce new disjointness-inconsistencies and don't have
+to repeat step 2.
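
To make the weighted voting described in the README addition above concrete, here is a minimal Python sketch of the per-class aggregation. It is illustrative only: `ModelVote` and `aggregate_class` are hypothetical names, not part of the chebifier API; the scores follow the formula quoted above (confidence · model_weight · trust).

```python
from dataclasses import dataclass


@dataclass
class ModelVote:
    """Hypothetical container for one model's vote on one class."""
    probability: float   # p_c^{m_i}: the model's predicted probability for class c
    model_weight: float  # per-model weight from the configuration file (default 1)
    trust: float         # per-class trust, e.g. 1 + F1 for wmv-f1, 1 for mv


def aggregate_class(votes: list[ModelVote], use_confidence: bool = True) -> int:
    """Return the ensemble decision (1 or 0) for a single class.

    Positive (p >= 0.5) and negative (p < 0.5) votes are summed separately;
    each vote contributes confidence * model_weight * trust.
    """
    positive = negative = 0.0
    for vote in votes:
        confidence = 2 * abs(vote.probability - 0.5) if use_confidence else 1.0
        score = confidence * vote.model_weight * vote.trust
        if vote.probability >= 0.5:
            positive += score
        else:
            negative += score
    return 1 if positive > negative else 0


# Worked example from the text: a weak positive vote (0.55) loses against
# a confident negative vote (0.1) when all weights and trusts are 1.
print(aggregate_class([ModelVote(0.55, 1.0, 1.0), ModelVote(0.10, 1.0, 1.0)]))  # -> 0
```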

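The three consistency-check steps can likewise be sketched in a few lines. Again, this is only an outline under stated assumptions, not the actual chebifier implementation: `enforce_consistency`, `subclass_pairs`, and `disjoint_pairs` are hypothetical names, and the subclass pairs are assumed to already include indirect (transitive) relations, as implied by step (1).

```python
import random


def enforce_consistency(pred: dict[str, int],
                        subclass_pairs: list[tuple[str, str]],
                        disjoint_pairs: list[tuple[str, str]]) -> dict[str, int]:
    """Post-process per-class 0/1 predictions as described above (sketch only)."""
    # Step 1: hierarchy correction - a positive subclass implies a positive superclass.
    for sub, sup in subclass_pairs:
        if pred.get(sub) == 1:
            pred[sup] = 1

    # Step 2: disjointness - if two disjoint classes are both predicted positive,
    # keep one of them at random and set the other to 0.
    for a, b in disjoint_pairs:
        if pred.get(a) == 1 and pred.get(b) == 1:
            pred[random.choice([a, b])] = 0

    # Step 3: repair the hierarchy again, this time downwards: if a superclass is 0,
    # its subclasses are set to 0 as well. This cannot introduce new disjointness
    # violations, so step 2 does not have to be repeated.
    for sub, sup in subclass_pairs:
        if pred.get(sup) == 0 and pred.get(sub) == 1:
            pred[sub] = 0

    return pred
```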
chebifier/cli.py

Lines changed: 25 additions & 37 deletions
@@ -2,54 +2,47 @@
 import yaml
 
 from .model_registry import ENSEMBLES
+from chebifier.ensemble.base_ensemble import BaseEnsemble
+from chebifier.ensemble.weighted_majority_ensemble import WMVwithPPVNPVEnsemble, WMVwithF1Ensemble
 
 
 @click.group()
 def cli():
     """Command line interface for Chebifier."""
     pass
 
+ENSEMBLES = {
+    "mv": BaseEnsemble,
+    "wmv-ppvnpv": WMVwithPPVNPVEnsemble,
+    "wmv-f1": WMVwithF1Ensemble
+}
 
 @cli.command()
-@click.argument("config_file", type=click.Path(exists=True))
-@click.option("--smiles", "-s", multiple=True, help="SMILES strings to predict")
-@click.option(
-    "--smiles-file",
-    "-f",
-    type=click.Path(exists=True),
-    help="File containing SMILES strings (one per line)",
-)
-@click.option(
-    "--output",
-    "-o",
-    type=click.Path(),
-    help="Output file to save predictions (optional)",
-)
-@click.option(
-    "--ensemble-type",
-    "-e",
-    type=click.Choice(ENSEMBLES.keys()),
-    default="mv",
-    help="Type of ensemble to use (default: Majority Voting)",
-)
-def predict(config_file, smiles, smiles_file, output, ensemble_type):
+@click.argument('config_file', type=click.Path(exists=True))
+@click.option('--smiles', '-s', multiple=True, help='SMILES strings to predict')
+@click.option('--smiles-file', '-f', type=click.Path(exists=True), help='File containing SMILES strings (one per line)')
+@click.option('--output', '-o', type=click.Path(), help='Output file to save predictions (optional)')
+@click.option('--ensemble-type', '-e', type=click.Choice(ENSEMBLES.keys()), default='mv', help='Type of ensemble to use (default: Majority Voting)')
+@click.option("--chebi-version", "-v", type=int, default=241, help="ChEBI version to use for checking consistency (default: 241)")
+@click.option("--use-confidence", "-c", is_flag=True, default=True, help="Weight predictions based on how 'confident' a model is in its prediction (default: True)")
+def predict(config_file, smiles, smiles_file, output, ensemble_type, chebi_version):
     """Predict ChEBI classes for SMILES strings using an ensemble model.
-
+
     CONFIG_FILE is the path to a YAML configuration file for the ensemble model.
     """
     # Load configuration from YAML file
-    with open(config_file, "r") as f:
+    with open(config_file, 'r') as f:
         config = yaml.safe_load(f)
-
+
     # Instantiate ensemble model
-    ensemble = ENSEMBLES[ensemble_type](config)
-
+    ensemble = ENSEMBLES[ensemble_type](config, chebi_version=chebi_version)
+
     # Collect SMILES strings from arguments and/or file
     smiles_list = list(smiles)
     if smiles_file:
-        with open(smiles_file, "r") as f:
+        with open(smiles_file, 'r') as f:
             smiles_list.extend([line.strip() for line in f if line.strip()])
-
+
     if not smiles_list:
         click.echo("No SMILES strings provided. Use --smiles or --smiles-file options.")
         return
@@ -60,13 +53,8 @@ def predict(config_file, smiles, smiles_file, output, ensemble_type):
     if output:
         # save as json
         import json
-
-        with open(output, "w") as f:
-            json.dump(
-                {smiles: pred for smiles, pred in zip(smiles_list, predictions)},
-                f,
-                indent=2,
-            )
+        with open(output, 'w') as f:
+            json.dump({smiles: pred for smiles, pred in zip(smiles_list, predictions)}, f, indent=2)
 
     else:
         # Print results
@@ -78,5 +66,5 @@ def predict(config_file, smiles, smiles_file, output, ensemble_type):
             click.echo(" No predictions")
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     cli()
