Merge pull request #37 from sciknoworg/dev

HamedBabaei · web-flow · commit ff2e81078d94 · 2025-05-26T12:23:40.000+01:00
#25 dropping dependency on setfit, ontospy and requirements update
diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml
@@ -12,7 +12,7 @@ jobs:
 
     strategy:
       matrix:
-        python-version: [3.10.x, 3.11.x]
+        python-version: [3.10.x, 3.11.x, 3.12.x, 3.13.x]
 
     steps:
       - name: Checkout repository
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,12 @@
 ## Changelog
 
-### V1.4.0 Changelog (May 2025)
+### V1.4.1 Changelog (May 26, 2025)
+- Fixed an issue related to #25 and #36.
+- Sentence-Transformers v4.1.0 is now supported.
+- Added Python 3.12 and 3.13 to the automated testing matrix.
+- Removed the dependency on ontospy since it is no longer maintained. It was originally used for the `MaterialInformationOntoOntology` class.
+
+### V1.4.0 Changelog (May 22, 2025)
 - Fixed a security vulnerability by updating the Torch and Transformers dependency version.
 - Integrated pytest into the pyproject.toml to enable testing support.
 - Resolved Python version compatibility issues in the continuous integration (CI) pipeline for stable test runs.
diff --git a/CITATION.cff b/CITATION.cff
@@ -17,5 +17,5 @@ keywords:
   - "Alignment"
   - "Python Library"
 license: "Apache-2.0"
-version: "1.4.0"
-date-released: "2025-05-22"
+version: "1.4.1"
+date-released: "2025-05-26"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -46,7 +46,7 @@ cd OntoAligner
 ```
 3. Create a virtual environment with python=3.10, activate it, install the required dependencies and install the pre-commit configuration:
 ```bash
-conda create -n my_env python=3.9
+conda create -n my_env python=3.10
 conda activate my_env
 pip install -r requirements.txt
 pre-commit install
diff --git a/ontoaligner/__init__.py b/ontoaligner/__init__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-__version__ = "1.4.0"
+__version__ = "1.4.1"
 
 from .pipeline import OntoAlignerPipeline
 from ontoaligner import ontology, base, encoder, aligner, utils, postprocess
diff --git a/ontoaligner/ontology/oaei/mse.py b/ontoaligner/ontology/oaei/mse.py
@@ -9,9 +9,8 @@
 import re
 from typing import Any, List
 
-import ontospy
-
 from ...base import BaseOntologyParser, OMDataset
+from ..generic import GenericOntology
 
 track = "mse"
 
@@ -120,137 +119,14 @@ def get_synonyms(self, owl_class: Any) -> List:
         return []
 
 
-class MaterialInformationOntoOntology(BaseOntologyParser):
+class MaterialInformationOntoOntology(GenericOntology):
     """
     A parser for the Material Information Ontology.
 
     This class provides methods for handling ontology items such as labels, names, IRIs,
     parents, children, and more. It also provides functionality to load the ontology from a file.
     """
-    def is_contain_label(self, owl_class: Any) -> bool:
-        """
-        Checks if the ontology class has a label.
-
-        Parameters:
-            owl_class (Any): The ontology class whose label presence is to be checked.
-
-        Returns:
-            bool: Always returns True as all classes are assumed to have labels.
-        """
-        return True
-
-    def get_name(self, owl_class: Any) -> str:
-        """
-        Retrieves the name of the ontology class.
-
-        Parameters:
-            owl_class (Any): The ontology class whose name is to be retrieved.
-
-        Returns:
-            str: The name of the ontology class.
-        """
-        return str(owl_class.uri).split("#")[1]
-
-    def get_label(self, owl_class: Any) -> str:
-        """
-        Retrieves and formats the label of the ontology class.
-
-        Parameters:
-            owl_class (Any): The ontology class whose label is to be retrieved.
-
-        Returns:
-            str: The formatted label of the ontology class.
-        """
-        preprocessed_str = (
-            self.get_iri(owl_class).split("#")[1].replace("_", " ").replace("-", "")
-        )
-        return split_string(preprocessed_str)
-
-    def get_iri(self, owl_class: Any) -> str:
-        """
-        Retrieves the IRI of the ontology class.
-
-        Parameters:
-            owl_class (Any): The ontology class whose IRI is to be retrieved.
-
-        Returns:
-            str: The IRI of the ontology class.
-        """
-        return str(owl_class.uri)
-
-    def get_childrens(self, owl_class: Any) -> List:
-        """
-        Retrieves the children of the ontology class.
-
-        Parameters:
-            owl_class (Any): The ontology class whose children are to be retrieved.
-
-        Returns:
-            List: A list of child classes for the given ontology class.
-        """
-        return self.get_owl_items(owl_class.children())
-
-    def get_parents(self, owl_class: Any) -> List:
-        """
-        Retrieves the parents of the ontology class.
-
-        Parameters:
-            owl_class (Any): The ontology class whose parents are to be retrieved.
-
-        Returns:
-            List: A list of parent classes for the given ontology class.
-        """
-        return self.get_owl_items(owl_class.parents())
-
-    def get_synonyms(self, owl_class: Any) -> List:
-        """
-        Retrieves synonyms for the ontology class.
-
-        Parameters:
-            owl_class (Any): The ontology class whose synonyms are to be retrieved.
-
-        Returns:
-            List: An empty list as no synonyms are implemented for this ontology class.
-        """
-        return []
-
-    def get_comments(self, owl_class: Any) -> List:
-        """
-        Retrieves comments for the ontology class.
-
-        Parameters:
-            owl_class (Any): The ontology class whose comments are to be retrieved.
-
-        Returns:
-            List: An empty list as no comments are implemented for this ontology class.
-        """
-        return []
-
-    def get_owl_classes(self, ontology: Any) -> Any:
-        """
-        Retrieves all classes from the ontology.
-
-        Parameters:
-            ontology (Any): The ontology whose classes are to be retrieved.
-
-        Returns:
-            Any: The classes of the ontology.
-        """
-        return ontology.all_classes
-
-    def load_ontology(self, input_file_path: str) -> Any:
-        """
-        Loads an ontology from the specified file.
-
-        Parameters:
-            input_file_path (str): The path to the ontology file to be loaded.
-
-        Returns:
-            Any: The loaded ontology.
-        """
-        ontology = ontospy.Ontospy(input_file_path, verbose=False)
-        return ontology
-
+    pass
 
 class MatOntoOntology(BaseOntologyParser):
     """
diff --git a/ontoaligner/postprocess/label_mapper.py b/ontoaligner/postprocess/label_mapper.py
@@ -4,11 +4,12 @@
 - `TFIDFLabelMapper`: Uses a TfidfVectorizer and a classifier for label prediction.
 - `SetFitShallowLabelMapper`: Uses a pretrained SetFit model for label prediction.
 """
-
 from typing import Dict, List, Tuple, Any
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.pipeline import Pipeline
-from setfit import SetFitModel
+from sentence_transformers import SentenceTransformer
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import LabelEncoder
 
 
 class LabelMapper:
@@ -110,34 +111,50 @@ def _predict(self, X: List[str]) -> List[str]:
         return self.model.predict(X)
 
 
-class SetFitShallowLabelMapper(LabelMapper):
+class SBERTLabelMapper(LabelMapper):
     """
-    LabelMapper subclass using a pretrained SetFit model for label prediction.
+    LabelMapper subclass using SentenceTransformer embeddings and a classifier for label prediction.
+
+    Example usage:
+    >>> label_dict = {
+            "yes":["yes", "correct", "true"],
+            "no":["no", "incorrect", "false"]
+        }
+    >>> mapper = SBERTLabelMapper("all-MiniLM-L12-v2", label_dict)
+    >>> mapper.fit()
+    >>> mapper.predict(["yes", "correct", "false", "nice", "too bad", "very good"])
+    ['yes', 'yes', 'no', 'yes', 'no', 'yes']
     """
-    def __init__(self, model_id: str, label_dict: Dict[str, List[str]], iterator_no: int = 10):
+    def __init__(self, model_id: str, label_dict: Dict[str, List[str]], classifier=None, iterator_no: int = 10):
         """
-        Initializes the SetFitShallowLabelMapper with a specified SetFit model.
+        Initializes the SBERTLabelMapper.
 
         Parameters:
-            model_id (str): Identifier for the pretrained SetFit model.
+            model_id (str): Name of the pretrained SentenceTransformer model.
             label_dict (Dict[str, List[str]]): Dictionary mapping each label to a list of candidate phrases.
             iterator_no (int): Number of iterations to replicate training data.
         """
         super().__init__(label_dict, iterator_no)
-        self.model = SetFitModel.from_pretrained(model_id)
+        self.embedder = SentenceTransformer(model_id)
+        self.classifier = classifier or LogisticRegression()
+        self.label_encoder = LabelEncoder()
 
     def fit(self):
-        """Fits the SetFit model on the training data."""
-        self.model.fit(self.x_train, self.y_train, num_epochs=10)
+        """Fits the classifier on the sentence embeddings."""
+        embeddings = self.embedder.encode(self.x_train, convert_to_numpy=True)
+        y_encoded = self.label_encoder.fit_transform(self.y_train)
+        self.classifier.fit(embeddings, y_encoded)
 
     def _predict(self, X: List[str]) -> List[str]:
         """
-        Predicts labels for the given input using the SetFit model.
+        Predicts labels using the sentence transformer + classifier pipeline
 
         Parameters:
             X (List[str]): List of input texts to classify.
 
         Returns:
             List[str]: Predicted labels.
         """
-        return self.model.predict(X)
+        embeddings = self.embedder.encode(X, convert_to_numpy=True)
+        y_pred_encoded = self.classifier.predict(embeddings)
+        return [str(pred) for pred in self.label_encoder.inverse_transform(y_pred_encoded)]
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "OntoAligner"
-version = "1.4.0"
+version = "1.4.1"
 description = "OntoAligner: A Comprehensive Modular and Robust Python Toolkit for Ontology Alignment."
 authors = ["Hamed Babaei Giglou <hamedbabaeigiglou@gmail.com>"]
 license="Apache-2.0"
@@ -17,18 +17,16 @@ numpy = "*"
 pandas = "*"
 datasets = "*"
 scikit-learn = "*"
-tqdm = "4.66.3"
+tqdm = "*"
 owlready2 = "0.44"
 rdflib = "7.1.1"
-ontospy = "2.1.1"
 torch = "2.7.0"
 transformers = "4.50.0"
 rapidfuzz = "3.5.2"
 openai = "1.56.0"
 rank_bm25 = "0.2.2"
 huggingface_hub="0.28.1"
-sentence-transformers = "3.4.1"
-setfit = "1.1.1"
+sentence-transformers = "4.1.0"
 bitsandbytes="0.45.1"
 
 [tool.poetry.dev-dependencies]
diff --git a/requirements.txt b/requirements.txt
@@ -4,18 +4,16 @@ numpy
 pandas
 datasets
 scikit_learn
-ontospy==2.1.1
 openai==1.56.0
 owlready2==0.44
 rank_bm25==0.2.2
 rapidfuzz==3.5.2
 rdflib==7.1.1
-sentence_transformers==3.4.1
+sentence_transformers==4.1.0
 torch==2.7.0
-tqdm==4.66.3
+tqdm
 transformers==4.50.0
 huggingface_hub==0.28.1
-setfit==1.1.1
 bitsandbytes==0.45.1
 pre-commit
 setuptools
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="OntoAligner",
-    version="1.4.0",
+    version="1.4.1",
     author="Hamed Babaei Giglou",
     author_email="hamedbabaeigiglou@gmail.com",
     description="OntoAligner: A Comprehensive Modular and Robust Python Toolkit for Ontology Alignment",
@@ -20,18 +20,16 @@
         "numpy",
         "pandas",
         "scikit-learn",
-        "ontospy==2.1.1",
+        "tqdm",
         "openai==1.56.0",
         "owlready2==0.44",
         "rank_bm25==0.2.2",
         "rapidfuzz==3.5.2",
         "rdflib==7.1.1",
-        "sentence-transformers==3.4.1",
+        "sentence-transformers==4.1.0",
         "torch==2.7.0",
-        "tqdm==4.66.3",
         "transformers==4.50.0",
         "huggingface_hub==0.28.1",
-        "setfit==1.1.1",
         "bitsandbytes==0.45.1",
     ],
     classifiers=[
diff --git a/tests/test_ontology.py b/tests/test_ontology.py
@@ -1,13 +1,13 @@
 import unittest
 import os
-from ontoaligner.ontology import GenericOntology
+import ontoaligner
 from rdflib import URIRef, RDFS
 
 class TestOntology(unittest.TestCase):
 
     def test_generic_ontology_parser(self):
         """Test that the parse function loads an ontology correctly."""
-        ontology = GenericOntology()
+        ontology = ontoaligner.ontology.GenericOntology()
         ontology_path = os.path.join(os.path.dirname(__file__), "data/test-case1.owl")
         data = ontology.parse(ontology_path)