diff --git a/.gitignore b/.gitignore index db3c0f3..a9c35b3 100644 --- a/.gitignore +++ b/.gitignore @@ -159,6 +159,9 @@ dmypy.json # Cython debug symbols cython_debug/ +# Pytest cache +test_output*/ + # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore diff --git a/ontoaligner/__init__.py b/ontoaligner/__init__.py index f5fc60e..bc7d9bb 100644 --- a/ontoaligner/__init__.py +++ b/ontoaligner/__init__.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. __version__ = "1.4.1" from .pipeline import OntoAlignerPipeline diff --git a/ontoaligner/aligner/__init__.py b/ontoaligner/aligner/__init__.py index 9b44eac..03b4e35 100644 --- a/ontoaligner/aligner/__init__.py +++ b/ontoaligner/aligner/__init__.py @@ -1,3 +1,16 @@ +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .fewshot.models import * # NOQA from .fewshot.dataset import * # NOQA from .icv.models import * # NOQA diff --git a/ontoaligner/aligner/fewshot/__init__.py b/ontoaligner/aligner/fewshot/__init__.py index 87d3afd..5f7b004 100644 --- a/ontoaligner/aligner/fewshot/__init__.py +++ b/ontoaligner/aligner/fewshot/__init__.py @@ -1,2 +1,14 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .fewshot import * # NOQA diff --git a/ontoaligner/aligner/fewshot/dataset.py b/ontoaligner/aligner/fewshot/dataset.py index e7345c5..31da2cf 100644 --- a/ontoaligner/aligner/fewshot/dataset.py +++ b/ontoaligner/aligner/fewshot/dataset.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines dataset classes for few-shot learning tasks, particularly for concept comparison tasks. These classes inherit from the RAGDataset class and extend its functionality to handle few-shot learning diff --git a/ontoaligner/aligner/fewshot/fewshot.py b/ontoaligner/aligner/fewshot/fewshot.py index 52d2561..b7f88f2 100644 --- a/ontoaligner/aligner/fewshot/fewshot.py +++ b/ontoaligner/aligner/fewshot/fewshot.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines the FewShotRAG class, an extension of the RAG model, designed for few-shot learning tasks. The FewShotRAG class uses retrieval-augmented generation techniques, combining information retrieval and diff --git a/ontoaligner/aligner/fewshot/models.py b/ontoaligner/aligner/fewshot/models.py index 0441e59..52e0e79 100644 --- a/ontoaligner/aligner/fewshot/models.py +++ b/ontoaligner/aligner/fewshot/models.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines a collection of classes that extend the FewShotRAG model, each combining a specific retrieval model and language model (LLM) configuration. These specialized configurations are tailored diff --git a/ontoaligner/aligner/icv/__init__.py b/ontoaligner/aligner/icv/__init__.py index 2727ca6..53bd511 100644 --- a/ontoaligner/aligner/icv/__init__.py +++ b/ontoaligner/aligner/icv/__init__.py @@ -1,2 +1,14 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .icv import * # NOQA diff --git a/ontoaligner/aligner/icv/icv.py b/ontoaligner/aligner/icv/icv.py index 7263d9f..1972bff 100644 --- a/ontoaligner/aligner/icv/icv.py +++ b/ontoaligner/aligner/icv/icv.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Script for implementing ICV-based ontology matching using RAG and LLM architectures. diff --git a/ontoaligner/aligner/icv/models.py b/ontoaligner/aligner/icv/models.py index e57f611..0a082d1 100644 --- a/ontoaligner/aligner/icv/models.py +++ b/ontoaligner/aligner/icv/models.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Script for integrating ICV-based language models with various retrieval mechanisms. diff --git a/ontoaligner/aligner/lightweight/__init__.py b/ontoaligner/aligner/lightweight/__init__.py index c542d79..5136729 100644 --- a/ontoaligner/aligner/lightweight/__init__.py +++ b/ontoaligner/aligner/lightweight/__init__.py @@ -1,2 +1,14 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from .lightweight import * # NOQA diff --git a/ontoaligner/aligner/lightweight/lightweight.py b/ontoaligner/aligner/lightweight/lightweight.py index 811fb98..3f8b2cf 100644 --- a/ontoaligner/aligner/lightweight/lightweight.py +++ b/ontoaligner/aligner/lightweight/lightweight.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines models for ontology matching, specifically a lightweight model and an extension that uses fuzzy string matching via the RapidFuzz library. @@ -108,7 +120,7 @@ def calculate_similarity(self, source: str, candidates: List) -> [int, float]: Returns: List: A list containing the index of the most similar candidate and the normalized similarity score. """ - selected_candid = rapidfuzz.process_cpp.extractOne( + selected_candid = rapidfuzz.process.extractOne( source, candidates, scorer=self.ratio_estimate(), diff --git a/ontoaligner/aligner/lightweight/models.py b/ontoaligner/aligner/lightweight/models.py index b966811..f7260ec 100644 --- a/ontoaligner/aligner/lightweight/models.py +++ b/ontoaligner/aligner/lightweight/models.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines different variants of the `FuzzySMLightweight` class, each implementing a different string similarity ratio estimation method using the RapidFuzz library. diff --git a/ontoaligner/aligner/llm/__init__.py b/ontoaligner/aligner/llm/__init__.py index 46746d1..e19c6c0 100644 --- a/ontoaligner/aligner/llm/__init__.py +++ b/ontoaligner/aligner/llm/__init__.py @@ -1,2 +1,14 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from .llm import * # NOQA diff --git a/ontoaligner/aligner/llm/dataset.py b/ontoaligner/aligner/llm/dataset.py index 674b37b..38d8abb 100644 --- a/ontoaligner/aligner/llm/dataset.py +++ b/ontoaligner/aligner/llm/dataset.py @@ -1,5 +1,16 @@ -# -*- coding: utf-8 -*- - +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import Any, Dict from torch.utils.data import Dataset diff --git a/ontoaligner/aligner/llm/llm.py b/ontoaligner/aligner/llm/llm.py index b0be4f3..933ef3f 100644 --- a/ontoaligner/aligner/llm/llm.py +++ b/ontoaligner/aligner/llm/llm.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines several classes for interacting with large language models (LLMs) through various architectures, such as a generic LLM class, OpenAI-based LLMs, and encoder-decoder diff --git a/ontoaligner/aligner/llm/models.py b/ontoaligner/aligner/llm/models.py index 49d2ab3..f62d766 100644 --- a/ontoaligner/aligner/llm/models.py +++ b/ontoaligner/aligner/llm/models.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines various subclasses for different types of language models (LMs), including encoder-decoder models, decoder-only models, and models interfacing with OpenAI's GPT. These classes inherit from diff --git a/ontoaligner/aligner/rag/__init__.py b/ontoaligner/aligner/rag/__init__.py index 81ad3bc..89ccf92 100644 --- a/ontoaligner/aligner/rag/__init__.py +++ b/ontoaligner/aligner/rag/__init__.py @@ -1,2 +1,14 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .rag import * # NOQA diff --git a/ontoaligner/aligner/rag/dataset.py b/ontoaligner/aligner/rag/dataset.py index b4523ff..135a745 100644 --- a/ontoaligner/aligner/rag/dataset.py +++ b/ontoaligner/aligner/rag/dataset.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines a set of custom dataset classes for handling various types of data used in a real-world entity classification task. These datasets preprocess and format the input data to create structured prompts for a classification model, with variations diff --git a/ontoaligner/aligner/rag/models.py b/ontoaligner/aligner/rag/models.py index 9a4247b..4ebf098 100644 --- a/ontoaligner/aligner/rag/models.py +++ b/ontoaligner/aligner/rag/models.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines a series of Retrieval-Augmented Generation (RAG) classes that combine different retrieval models and language models (LLMs). Each class specializes in pairing a specific retrieval model (e.g., AdaRetrieval, BERTRetrieval) diff --git a/ontoaligner/aligner/rag/rag.py b/ontoaligner/aligner/rag/rag.py index ae74745..47579ee 100644 --- a/ontoaligner/aligner/rag/rag.py +++ b/ontoaligner/aligner/rag/rag.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines several classes that implement retrieval-augmented generation (RAG) architectures for natural language generation tasks. The architecture integrates retrieval models (such as AdaRetrieval and BERTRetrieval) and language models (such as AutoModelForCausalLM and OpenAI) diff --git a/ontoaligner/aligner/retrieval/__init__.py b/ontoaligner/aligner/retrieval/__init__.py index aad761e..d675f15 100644 --- a/ontoaligner/aligner/retrieval/__init__.py +++ b/ontoaligner/aligner/retrieval/__init__.py @@ -1,2 +1,14 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .retrieval import * # NOQA diff --git a/ontoaligner/aligner/retrieval/models.py b/ontoaligner/aligner/retrieval/models.py index e2b8cb2..5e4e8c9 100644 --- a/ontoaligner/aligner/retrieval/models.py +++ b/ontoaligner/aligner/retrieval/models.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines various retrieval models used for information retrieval tasks. It includes both traditional methods (such as TF-IDF and BM25) as well as more modern diff --git a/ontoaligner/aligner/retrieval/retrieval.py b/ontoaligner/aligner/retrieval/retrieval.py index ef8ef0f..dba9e38 100644 --- a/ontoaligner/aligner/retrieval/retrieval.py +++ b/ontoaligner/aligner/retrieval/retrieval.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines different retrieval models for matching source and target ontologies. It provides several classes that implement different retrieval techniques, such as Bi-Encoder Retrieval, diff --git a/ontoaligner/base/__init__.py b/ontoaligner/base/__init__.py index 33d4f90..496a337 100644 --- a/ontoaligner/base/__init__.py +++ b/ontoaligner/base/__init__.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .dataset import * # NOQA from .encoder import * # NOQA from .model import * # NOQA diff --git a/ontoaligner/base/dataset.py b/ontoaligner/base/dataset.py index 1310e0e..df27275 100644 --- a/ontoaligner/base/dataset.py +++ b/ontoaligner/base/dataset.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ The script is responsible for loading and collecting data related to source and target ontologies, along with reference alignments. It provides methods for collecting data, loading from JSON, and handling file paths. diff --git a/ontoaligner/base/encoder.py b/ontoaligner/base/encoder.py index 1a3776b..7319d00 100644 --- a/ontoaligner/base/encoder.py +++ b/ontoaligner/base/encoder.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script provides a foundation for flexible text encoding, including text preprocessing, customizable prompt templates, and structured methods for encoding and retrieving encoder-specific details. 
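
Note on the lightweight.py hunk above: `rapidfuzz.process_cpp` is an internal implementation module with no stability guarantee, so switching to the public `rapidfuzz.process` entry point is the durable fix. A minimal sketch of the call that `calculate_similarity` wraps, assuming RapidFuzz >= 2.0 (where `extractOne` returns a `(choice, score, index)` tuple); the candidate strings are illustrative only:

    from rapidfuzz import fuzz, process

    # Built-in fuzz scorers report similarity on a 0-100 scale; dividing by
    # 100 gives the normalized score that calculate_similarity returns.
    candidates = ["Computer Sciences", "Information Tech", "Software Engineering"]
    choice, score, index = process.extractOne("Computer Science", candidates, scorer=fuzz.ratio)
    print(choice, index, score / 100)  # Computer Sciences 0 ~0.97
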
diff --git a/ontoaligner/base/model.py b/ontoaligner/base/model.py index b5df623..d58afd0 100644 --- a/ontoaligner/base/model.py +++ b/ontoaligner/base/model.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Defines a blueprint for ontology matching models, specifying methods for string representation and data generation that must be implemented by subclasses. The script ensures consistency and structure for building specialized models in the ontology matching domain. diff --git a/ontoaligner/base/ontology.py b/ontoaligner/base/ontology.py index e106166..cfbf3aa 100644 --- a/ontoaligner/base/ontology.py +++ b/ontoaligner/base/ontology.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script provides functionality for parsing ontologies and alignment files. It includes methods for extracting data from OWL ontologies, such as names, labels, and relationships, diff --git a/ontoaligner/encoder/__init__.py b/ontoaligner/encoder/__init__.py index 9f29cae..00c6c09 100644 --- a/ontoaligner/encoder/__init__.py +++ b/ontoaligner/encoder/__init__.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .fewshot import * # NOQA from .lightweight import * # NOQA from .llm import * # NOQA diff --git a/ontoaligner/encoder/encoders.py b/ontoaligner/encoder/encoders.py index 23d2bae..a187d87 100644 --- a/ontoaligner/encoder/encoders.py +++ b/ontoaligner/encoder/encoders.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines three encoder classes that extend the BaseEncoder class. Each class is designed to parse ontological data in different ways, with different encoding strategies, diff --git a/ontoaligner/encoder/fewshot.py b/ontoaligner/encoder/fewshot.py index 2306b96..5c3b136 100644 --- a/ontoaligner/encoder/fewshot.py +++ b/ontoaligner/encoder/fewshot.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines three encoder classes for few-shot learning based on the RAG (retrieval-augmented generation) method. These classes extend the functionality of the RAG-based encoders for concept, concept children, and concept parent, diff --git a/ontoaligner/encoder/lightweight.py b/ontoaligner/encoder/lightweight.py index 31eccd3..714f617 100644 --- a/ontoaligner/encoder/lightweight.py +++ b/ontoaligner/encoder/lightweight.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines three encoder classes that inherit from the LightweightEncoder class. These encoders are used to process and transform OWL (Web Ontology Language) items into a format suitable for downstream tasks. diff --git a/ontoaligner/encoder/llm.py b/ontoaligner/encoder/llm.py index 8158ecf..f624f82 100644 --- a/ontoaligner/encoder/llm.py +++ b/ontoaligner/encoder/llm.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. from typing import Any, Dict from .encoders import LLMEncoder diff --git a/ontoaligner/encoder/rag.py b/ontoaligner/encoder/rag.py index 42b919f..c6b30cd 100644 --- a/ontoaligner/encoder/rag.py +++ b/ontoaligner/encoder/rag.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines three encoder classes that extend the `RAGEncoder` class to specialize in encoding OWL items representing different ontology concepts. These encoders use a retrieval-based approach along with a language model diff --git a/ontoaligner/ontology/__init__.py b/ontoaligner/ontology/__init__.py index fdadfcb..0ba3e5b 100644 --- a/ontoaligner/ontology/__init__.py +++ b/ontoaligner/ontology/__init__.py @@ -1,3 +1,15 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .oaei import * # NOQA from .generic import GenericOntology, GenericOMDataset diff --git a/ontoaligner/ontology/generic.py b/ontoaligner/ontology/generic.py index 5961f70..fc90990 100644 --- a/ontoaligner/ontology/generic.py +++ b/ontoaligner/ontology/generic.py @@ -1,3 +1,16 @@ +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from rdflib import Graph, URIRef, RDFS, SKOS, BNode from rdflib.namespace import OWL, RDF from tqdm import tqdm diff --git a/ontoaligner/ontology/oaei/__init__.py b/ontoaligner/ontology/oaei/__init__.py index 98f1a88..7d351bb 100644 --- a/ontoaligner/ontology/oaei/__init__.py +++ b/ontoaligner/ontology/oaei/__init__.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .anatomy import * # NOQA from .biodiv import * # NOQA from .bioml import * # NOQA diff --git a/ontoaligner/ontology/oaei/anatomy.py b/ontoaligner/ontology/oaei/anatomy.py index 57bc8d6..7fa7384 100644 --- a/ontoaligner/ontology/oaei/anatomy.py +++ b/ontoaligner/ontology/oaei/anatomy.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines ontology parsers for the Mouse and Human ontologies, extending the base ontology parser. """ diff --git a/ontoaligner/ontology/oaei/biodiv.py b/ontoaligner/ontology/oaei/biodiv.py index a6714db..041ab7b 100644 --- a/ontoaligner/ontology/oaei/biodiv.py +++ b/ontoaligner/ontology/oaei/biodiv.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines ontology parsers for various ontologies (ENVO, SWEET, SeaLife, TAXREFLD, NCBI) and creates specific dataset classes for each ontology pairing. The ontology parsers extract various diff --git a/ontoaligner/ontology/oaei/bioml.py b/ontoaligner/ontology/oaei/bioml.py index 93a9bce..a7ede10 100644 --- a/ontoaligner/ontology/oaei/bioml.py +++ b/ontoaligner/ontology/oaei/bioml.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines ontology parsers and datasets for bioinformatics-related tasks, specifically for processing disease-related ontologies and alignment data. The main objective is to handle ontological data, parse TSV diff --git a/ontoaligner/ontology/oaei/commonkg.py b/ontoaligner/ontology/oaei/commonkg.py index 883b506..3867d47 100644 --- a/ontoaligner/ontology/oaei/commonkg.py +++ b/ontoaligner/ontology/oaei/commonkg.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines classes for parsing and processing datasets that are based on the CommonKG (Common Knowledge Graph) ontology. The main objective is to provide ontology diff --git a/ontoaligner/ontology/oaei/food.py b/ontoaligner/ontology/oaei/food.py index af0e81b..fdd5e32 100644 --- a/ontoaligner/ontology/oaei/food.py +++ b/ontoaligner/ontology/oaei/food.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines classes for parsing and processing datasets related to food ontologies. It includes a class for parsing the `FoodOntology` and a dataset configurations. diff --git a/ontoaligner/ontology/oaei/mse.py b/ontoaligner/ontology/oaei/mse.py index e8465c4..5af858d 100644 --- a/ontoaligner/ontology/oaei/mse.py +++ b/ontoaligner/ontology/oaei/mse.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines several classes for parsing and processing ontologies related to material science and engineering. 
It includes helper functions for string manipulation and diff --git a/ontoaligner/ontology/oaei/phenotype.py b/ontoaligner/ontology/oaei/phenotype.py index f53822c..d1278c5 100644 --- a/ontoaligner/ontology/oaei/phenotype.py +++ b/ontoaligner/ontology/oaei/phenotype.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This script defines ontology parsers for various disease-related ontologies, including DOID (Disease Ontology), ORDO (Orphanet Rare Disease Ontology), HP (Human Phenotype Ontology), and MP (Mammalian Phenotype Ontology). It also defines dataset classes that map between source and target ontologies diff --git a/ontoaligner/pipeline.py b/ontoaligner/pipeline.py index 9d7c36b..f36b4bd 100644 --- a/ontoaligner/pipeline.py +++ b/ontoaligner/pipeline.py @@ -1,3 +1,16 @@ +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Ontology Alignment Pipeline. Various methods such as lightweight matching, retriever-based matching, LLM-based matching, and RAG (Retriever-Augmented Generation) techniques has been applied. @@ -106,17 +119,23 @@ def __call__(self, method: str, encoder_model: BaseEncoder = None, model_class: Returns: dict or None: Evaluation report if `evaluate` is True. Matching results if `return_matching` is True. """ + if not (0 <= fuzzy_sm_threshold <= 1): + raise ValueError(f"fuzzy_sm_threshold must be between 0 and 1. 
Got {fuzzy_sm_threshold}") + + if method not in ["lightweight", "retrieval", "llm"] and "rag" not in method: + raise ValueError(f"Unknown method: {method}") + if method == "lightweight": matchings = self._run_lightweight(encoder_model or ConceptLightweightEncoder(), model_class or SimpleFuzzySMLightweight, postprocessor, fuzzy_sm_threshold) - elif method == "retrieval": + if method == "retrieval": matchings = self._run_retriever(encoder_model or ConceptLightweightEncoder(), model_class or SBERTRetrieval, postprocessor or retriever_postprocessor, retriever_path, device, top_k, ir_threshold) - elif method == "llm": + if method == "llm": matchings = self._run_llm(encoder_model or ConceptLLMEncoder(), model_class or AutoModelDecoderLLM, dataset_class or ConceptLLMDataset, postprocessor or llm_postprocessor, llm_mapper or TFIDFLabelMapper(classifier=LogisticRegression(), ngram_range=(1, 1)), llm_mapper_interested_class, llm_path, device, batch_size, max_length, max_new_tokens, llm_threshold) - elif 'rag' in method: + if 'rag' in method: retriever_config = {"device": device, "top_k": top_k, "openai_key": openai_key} llm_config = {"device": device, "batch_size": batch_size, "answer_set": answer_set, "huggingface_access_token": huggingface_access_token, "max_length": max_length, "max_new_tokens": max_new_tokens, "openai_key": openai_key, "device_map": device_map} @@ -129,8 +148,6 @@ def __call__(self, method: str, encoder_model: BaseEncoder = None, model_class: encoder_model = encoder_model or ConceptRAGEncoder() matchings = self._run_rag(method, encoder_model, model_class, postprocessor or rag_hybrid_postprocessor, llm_threshold, ir_rag_threshold, retriever_path, llm_path, rag_config) - else: - raise ValueError(f"Unknown method: {method}") return self._process_results(matchings, method, evaluate, return_matching, output_file_name, save_matchings) def _run_lightweight(self, encoder_model, model_class, postprocessor, fuzzy_sm_threshold): diff --git a/ontoaligner/postprocess/__init__.py b/ontoaligner/postprocess/__init__.py index 8239953..ce80de0 100644 --- a/ontoaligner/postprocess/__init__.py +++ b/ontoaligner/postprocess/__init__.py @@ -1,4 +1,16 @@ -# -*- coding: utf-8 -*- +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .process import * # NOQA from .label_mapper import * # NOQA from .util import * # NOQA diff --git a/ontoaligner/postprocess/label_mapper.py b/ontoaligner/postprocess/label_mapper.py index bb986b9..1321945 100644 --- a/ontoaligner/postprocess/label_mapper.py +++ b/ontoaligner/postprocess/label_mapper.py @@ -1,3 +1,16 @@ +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 This script provides an implementation of label mapping using different machine learning approaches.
 It defines a base `LabelMapper` class and two specific subclasses:
@@ -51,7 +64,7 @@ def validate_predicts(self, preds: List[str]):
         """
         for pred in preds:
             if pred.lower() not in self.labels:
-                print(f"{pred} in prediction is not a valid label!")
+                raise AssertionError(f"{pred} in prediction is not a valid label!")
 
     def predict(self, X: List[str]) -> List[str]:
         """
@@ -117,9 +130,9 @@ class SBERTLabelMapper(LabelMapper):
 
     Example usage:
         >>> label_dict = {
-                "yes":["yes", "correct", "true"],
-                "no":["no", "incorrect", "false"]
-            }
+        ...     "yes":["yes", "correct", "true"],
+        ...     "no":["no", "incorrect", "false"]
+        ... }
         >>> mapper = SBERTLabelMapper("all-MiniLM-L12-v2", label_dict)
         >>> mapper.fit()
         >>> mapper.predict(["yes", "correct", "false", "nice", "too bad", "very good"])
diff --git a/ontoaligner/postprocess/process.py b/ontoaligner/postprocess/process.py
index 1eb993a..82efd88 100644
--- a/ontoaligner/postprocess/process.py
+++ b/ontoaligner/postprocess/process.py
@@ -1,4 +1,16 @@
-# -*- coding: utf-8 -*-
+# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 This script contains functions to preprocess, evaluate, and filter outputs
 generated by information retrieval (IR) systems and language models (LLMs), including confidence
diff --git a/ontoaligner/postprocess/util.py b/ontoaligner/postprocess/util.py
index c089513..a7e1c48 100644
--- a/ontoaligner/postprocess/util.py
+++ b/ontoaligner/postprocess/util.py
@@ -1,3 +1,16 @@
+# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 Set of helper functions for post-processing methods.
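
The label_mapper.py change above converts a silent `print` into a hard failure when a prediction falls outside the configured label set. A standalone sketch of the new behaviour (`LabelMapperSketch` is a hypothetical stand-in, not the library class):

    from typing import List

    class LabelMapperSketch:
        """Hypothetical stand-in mirroring LabelMapper.validate_predicts."""

        def __init__(self, labels: List[str]):
            self.labels = [label.lower() for label in labels]

        def validate_predicts(self, preds: List[str]) -> None:
            # Unknown predictions now raise instead of being printed and ignored.
            for pred in preds:
                if pred.lower() not in self.labels:
                    raise AssertionError(f"{pred} in prediction is not a valid label!")

    mapper = LabelMapperSketch(["yes", "no"])
    mapper.validate_predicts(["Yes", "no"])  # passes: the check is case-insensitive
    # mapper.validate_predicts(["maybe"])    # raises AssertionError
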
diff --git a/ontoaligner/utils/__init__.py b/ontoaligner/utils/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/ontoaligner/utils/metrics.py b/ontoaligner/utils/metrics.py
index 29189aa..5864c54 100644
--- a/ontoaligner/utils/metrics.py
+++ b/ontoaligner/utils/metrics.py
@@ -1,4 +1,16 @@
-# -*- coding: utf-8 -*-
+# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 This script defines functions for evaluating the intersection between predicted and reference data,
 as well as calculating various evaluation metrics such as precision, recall, and F-score.
diff --git a/ontoaligner/utils/xmlify.py b/ontoaligner/utils/xmlify.py
index 80738df..5099253 100644
--- a/ontoaligner/utils/xmlify.py
+++ b/ontoaligner/utils/xmlify.py
@@ -1,3 +1,16 @@
+# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 This module provides functionality to generate XML alignment files compliant with the Alignment API.
 It is useful for representing ontology matching results in a standardized XML format.
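
Before the new test files: the pipeline.py hunk earlier in this diff hoists argument validation to the top of `OntoAlignerPipeline.__call__`, so bad inputs fail before any matcher is constructed. A sketch of the two guards in isolation (the standalone function is illustrative; in the library the checks run inline):

    def validate_pipeline_args(method: str, fuzzy_sm_threshold: float) -> None:
        # Mirrors the guards added in ontoaligner/pipeline.py.
        if not (0 <= fuzzy_sm_threshold <= 1):
            raise ValueError(f"fuzzy_sm_threshold must be between 0 and 1. Got {fuzzy_sm_threshold}")
        if method not in ["lightweight", "retrieval", "llm"] and "rag" not in method:
            raise ValueError(f"Unknown method: {method}")

    validate_pipeline_args("lightweight", 0.5)    # ok
    # validate_pipeline_args("lightweight", 1.5)  # ValueError: bad threshold
    # validate_pipeline_args("graph", 0.5)        # ValueError: Unknown method: graph

Note that the substring test `"rag" not in method` accepts any method name containing "rag" (e.g. "rag", "fewshot-rag"), matching the `'rag' in method` dispatch branch.
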
diff --git a/tests/aligners/test_lightweight_aligner.py b/tests/aligners/test_lightweight_aligner.py
new file mode 100644
index 0000000..6d32529
--- /dev/null
+++ b/tests/aligners/test_lightweight_aligner.py
@@ -0,0 +1,218 @@
+import unittest
+from ontoaligner.aligner.lightweight.lightweight import Lightweight, FuzzySMLightweight
+from ontoaligner.aligner.lightweight.models import (
+    SimpleFuzzySMLightweight,
+    WeightedFuzzySMLightweight,
+    TokenSetFuzzySMLightweight,
+)
+from rapidfuzz import fuzz
+
+
+class TestLightweightAligner(unittest.TestCase):
+    def setUp(self):
+        self.lightweight = Lightweight(fuzzy_sm_threshold=0.5)
+        self.fuzzy_lightweight = FuzzySMLightweight(fuzzy_sm_threshold=0.7)
+
+        # Test data
+        self.source_ontology = [
+            {
+                "iri": "http://example.org/source1",
+                "text": "Computer Science",
+                "label": "Computer Science",
+            },
+            {
+                "iri": "http://example.org/source2",
+                "text": "Information Technology",
+                "label": "Information Technology",
+            },
+        ]
+
+        self.target_ontology = [
+            {
+                "iri": "http://example.org/target1",
+                "text": "Computer Sciences",
+                "label": "Computer Sciences",
+            },
+            {
+                "iri": "http://example.org/target2",
+                "text": "Information Tech",
+                "label": "Information Tech",
+            },
+            {
+                "iri": "http://example.org/target3",
+                "text": "Software Engineering",
+                "label": "Software Engineering",
+            },
+        ]
+
+    def test_lightweight_initialization(self):
+        """Test initialization of Lightweight aligner"""
+        self.assertEqual(self.lightweight.kwargs["fuzzy_sm_threshold"], 0.5)
+        self.assertEqual(str(self.lightweight), "Lightweight")
+
+    def test_lightweight_init_retriever(self):
+        """Test init_retriever method of Lightweight aligner"""
+        # Should not raise any exception
+        try:
+            self.lightweight.init_retriever(None)
+        except Exception as e:
+            self.fail(f"init_retriever raised an exception: {str(e)}")
+
+    def test_lightweight_generate(self):
+        """Test generate method of Lightweight aligner"""
+        # Base Lightweight generate method should return None
+        result = self.lightweight.generate([self.source_ontology, self.target_ontology])
+        self.assertIsNone(result)
+
+    def test_fuzzy_lightweight_initialization(self):
+        """Test initialization of FuzzySMLightweight aligner"""
+        self.assertEqual(self.fuzzy_lightweight.kwargs["fuzzy_sm_threshold"], 0.7)
+        self.assertEqual(str(self.fuzzy_lightweight), "Lightweight")
+
+    def test_fuzzy_lightweight_ratio_estimate(self):
+        """Test ratio_estimate method of FuzzySMLightweight aligner"""
+        # Should not raise any exception and return None (as it's a placeholder)
+        result = self.fuzzy_lightweight.ratio_estimate()
+        self.assertIsNone(result)
+
+    def test_fuzzy_lightweight_calculate_similarity(self):
+        """Test calculate_similarity method of FuzzySMLightweight aligner"""
+        # Set up ratio_estimate to use RapidFuzz ratio
+        self.fuzzy_lightweight.ratio_estimate = lambda: fuzz.ratio
+
+        # Test case 1: Exact match
+        source = "Computer Science"
+        candidates = ["Computer Science", "Information Tech", "Software Engineering"]
+        idx, score = self.fuzzy_lightweight.calculate_similarity(source, candidates)
+        self.assertEqual(idx, 0)
+        self.assertEqual(score, 1.0)  # Score should be normalized between 0 and 1
+
+        # Test case 2: Close match with different casing
+        source = "computer science"
+        candidates = ["Computer Science", "Information Tech", "Software Engineering"]
+        idx, score = self.fuzzy_lightweight.calculate_similarity(source, candidates)
+        self.assertEqual(idx, 0)
+        self.assertGreater(
+            score, 0.9
+        )  # Should be high similarity despite case difference
+
+        # Test case 3: Partial match
+        source = "Computer"
+        candidates = ["Computer Science", "Information Tech", "Software Engineering"]
+        idx, score = self.fuzzy_lightweight.calculate_similarity(source, candidates)
+        self.assertEqual(idx, 0)
+        self.assertLess(score, 1.0)  # Should be partial match
+
+        # Test case 4: No good match
+        source = "Mathematics"
+        candidates = ["Computer Science", "Information Tech", "Software Engineering"]
+        idx, score = self.fuzzy_lightweight.calculate_similarity(source, candidates)
+        self.assertLess(score, 0.7)  # Compare with normalized threshold
+
+    def test_fuzzy_lightweight_generate(self):
+        """Test generate method of FuzzySMLightweight aligner"""
+        # Set up ratio_estimate to use RapidFuzz ratio
+        self.fuzzy_lightweight.ratio_estimate = lambda: fuzz.ratio
+
+        predictions = self.fuzzy_lightweight.generate(
+            [self.source_ontology, self.target_ontology]
+        )
+
+        self.assertIsInstance(predictions, list)
+        self.assertGreater(len(predictions), 0)
+
+        # Check prediction structure
+        for pred in predictions:
+            self.assertIsInstance(pred, dict)  # Predictions should be dictionaries
+            self.assertIn("source", pred)  # Output uses "source"/"target" keys
+            self.assertIn("target", pred)
+            self.assertIn("score", pred)
+            self.assertIsInstance(pred["source"], str)  # Should be IRI string
+            self.assertIsInstance(pred["target"], str)  # Should be IRI string
+            self.assertIsInstance(pred["score"], (int, float))
+            self.assertGreaterEqual(
+                pred["score"], self.fuzzy_lightweight.kwargs["fuzzy_sm_threshold"]
+            )
+
+    def test_fuzzy_lightweight_threshold_filtering(self):
+        """Test threshold filtering in FuzzySMLightweight aligner"""
+        # Test with different thresholds
+        thresholds = [0.5, 0.7, 0.9, 0.99]
+        test_cases = [
+            {
+                "source": {
+                    "iri": "http://example.org/src1",
+                    "text": "Computer Science",
+                    "label": "Computer Science",
+                },
+                "target": {
+                    "iri": "http://example.org/tgt1",
+                    "text": "Computer Sciences",
+                    "label": "Computer Sciences",
+                },
+                "expected_matches": [
+                    True,
+                    True,
+                    True,
+                    False,
+                ],  # Whether it should match at each threshold
+            },
+            {
+                "source": {
+                    "iri": "http://example.org/src2",
+                    "text": "Information Technology",
+                    "label": "Information Technology",
+                },
+                "target": {
+                    "iri": "http://example.org/tgt2",
+                    "text": "Info Tech",
+                    "label": "Info Tech",
+                },
+                "expected_matches": [True, False, False, False],
+            },
+        ]
+
+        for threshold, test_case in [(t, tc) for t in thresholds for tc in test_cases]:
+            aligner = FuzzySMLightweight(fuzzy_sm_threshold=threshold)
+            aligner.ratio_estimate = lambda: fuzz.ratio
+
+            predictions = aligner.generate(
+                [[test_case["source"]], [test_case["target"]]]
+            )
+
+            match_found = len(predictions) > 0
+            expected_match = test_case["expected_matches"][thresholds.index(threshold)]
+
+            self.assertEqual(
+                match_found,
+                expected_match,
+                f"Failed with threshold {threshold} for {test_case['source']['text']} -> {test_case['target']['text']}",
+            )
+
+
+def test_simple_fuzzy_lightweight():
+    """Test SimpleFuzzySMLightweight class."""
+    aligner = SimpleFuzzySMLightweight()
+    assert aligner is not None
+    assert str(aligner) == "Lightweight-SimpleFuzzySMLightweight"
+    assert aligner.ratio_estimate() == fuzz.ratio
+
+
+def test_weighted_fuzzy_lightweight():
+    """Test WeightedFuzzySMLightweight class."""
+    aligner = WeightedFuzzySMLightweight()
+    assert aligner is not None
+    assert str(aligner) == "Lightweight-WeightedFuzzySMLightweight"
+    assert aligner.ratio_estimate() == fuzz.WRatio
+
+
+def test_token_set_fuzzy_lightweight():
+    """Test TokenSetFuzzySMLightweight class."""
+    aligner = TokenSetFuzzySMLightweight()
+    assert aligner is not None
+    assert str(aligner) == "Lightweight-TokenSetFuzzySMLightweight"
+    assert aligner.ratio_estimate() == fuzz.token_set_ratio
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/aligners/test_rag_advanced.py b/tests/aligners/test_rag_advanced.py
new file mode 100644
index 0000000..cd7181c
--- /dev/null
+++ b/tests/aligners/test_rag_advanced.py
@@ -0,0 +1,128 @@
+import pytest
+import torch
+from unittest.mock import MagicMock
+from ontoaligner.aligner.rag.rag import (
+    RAGBasedDecoderLLMArch,
+    OpenAIRAGLLM,
+)
+
+
+@pytest.fixture
+def mock_tokenizer():
+    tokenizer = MagicMock()
+    tokenizer.encode.return_value = [0, 1]
+    tokenizer.decode.return_value = "test"
+    tokenizer.eos_token_id = 2
+    return tokenizer
+
+
+@pytest.fixture
+def mock_model():
+    model = MagicMock()
+    model.generate.return_value = MagicMock(
+        scores=[torch.rand(1, 10)], sequences=torch.tensor([[1, 2, 3]])
+    )
+    return model
+
+
+def test_model_initialization_parameters():
+    """Test model initialization with different parameter combinations."""
+    # Test with minimal parameters
+    model = RAGBasedDecoderLLMArch()
+    assert model is not None
+    assert "yes" in model.ANSWER_SET
+    assert "no" in model.ANSWER_SET
+
+    # Test with custom answer set
+    custom_answers = {"yes": ["positive", "affirmative"], "no": ["negative", "false"]}
+    model = RAGBasedDecoderLLMArch(answer_set=custom_answers)
+    assert model.ANSWER_SET == custom_answers
+
+    # Test with device specification
+    model = RAGBasedDecoderLLMArch(
+        device="cuda" if torch.cuda.is_available() else "cpu"
+    )
+    assert "device" in model.kwargs
+
+
+def test_model_string_representations():
+    """Test string representations of different model types."""
+    models = [
+        (RAGBasedDecoderLLMArch(), "RAGBasedDecoderLLMArch"),
+        (OpenAIRAGLLM(), "RAGBasedOpenAILLMArch-OpenAILLM"),
+    ]
+
+    for model, expected_str in models:
+        assert str(model) == expected_str
+
+
+def test_model_generation_with_different_inputs(mock_tokenizer, mock_model):
+    """Test model generation with various input types."""
+    model = RAGBasedDecoderLLMArch()
+    model.tokenizer = mock_tokenizer
+    model.model = mock_model
+    model.kwargs = {"max_new_tokens": 10}
+
+    # Test with different input types
+    inputs = [
+        {"input_ids": torch.tensor([[1, 2, 3]])},
+        {
+            "input_ids": torch.tensor([[4, 5, 6]]),
+            "attention_mask": torch.tensor([[1, 1, 1]]),
+        },
+        {
+            "input_ids": torch.tensor([[7, 8, 9]]),
+            "token_type_ids": torch.tensor([[0, 0, 0]]),
+        },
+    ]
+
+    for input_data in inputs:
+        output = model.generate_for_llm(input_data)
+        assert output is not None
+        assert hasattr(output, "scores")
+        assert hasattr(output, "sequences")
+
+
+def test_openai_model_response_handling():
+    """Test OpenAI model's handling of different response formats."""
+    model = OpenAIRAGLLM()
+
+    # Test various response formats
+    test_responses = [
+        "Yes, these concepts are equivalent.",
+        "No, these are different concepts.",
+        "These concepts appear to be the same.",
+        "The concepts are not related.",
+        "Based on the context, yes.",
+        "Cannot determine the relationship.",
+    ]
+
+    mock_responses = []
+    for response in test_responses:
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock()]
+        mock_response.choices[0].message.content = response
+        mock_responses.append(mock_response)
+
+    sequences, probas = model.post_processor(mock_responses)
+
+    assert len(sequences) == len(test_responses)
+    assert all(s in ["yes", "no"] for s in sequences)
+    assert all(0 <= p <= 1 for p in probas)
+
+
+def test_model_error_handling():
+    """Test model's error handling capabilities."""
+    model = RAGBasedDecoderLLMArch()
+
+    # Test with invalid input
+    with pytest.raises(Exception):
+        model.generate(None)
+
+    # Test with empty input
+    with pytest.raises(Exception):
+        model.generate([])
+
+    # Test with malformed input
+    with pytest.raises(Exception):
+        model.generate([{"invalid": "data"}])
diff --git a/tests/aligners/test_rag_aligner.py b/tests/aligners/test_rag_aligner.py
new file mode 100644
index 0000000..d12893e
--- /dev/null
+++ b/tests/aligners/test_rag_aligner.py
@@ -0,0 +1,228 @@
+import unittest
+import torch
+from unittest.mock import MagicMock, patch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from ontoaligner.aligner.rag.rag import (
+    RAGBasedDecoderLLMArch,
+    RAGBasedOpenAILLMArch,
+    RAG,
+    AutoModelDecoderRAGLLM,
+    AutoModelDecoderRAGLLMV2,
+    OpenAIRAGLLM,
+    MambaSSMRAGLLM,
+)
+
+
+class TestRAGBasedDecoderLLMArch(unittest.TestCase):
+    def setUp(self):
+        self.model = RAGBasedDecoderLLMArch()
+        # Mock tokenizer
+        self.model.tokenizer = MagicMock()
+        self.model.tokenizer.encode.return_value = [0, 1]  # Mock tokenizer output
+        self.model.tokenizer.decode.return_value = "test"  # Mock decoding
+
+        # Custom answer set for testing
+        self.test_answer_set = {
+            "yes": ["yes", "correct", "true"],
+            "no": ["no", "incorrect", "false"],
+        }
+
+    def test_initialization(self):
+        """Test initialization with default and custom answer sets"""
+        # Test default initialization
+        model = RAGBasedDecoderLLMArch()
+        self.assertTrue(all(key in model.ANSWER_SET for key in ["yes", "no"]))
+
+        # Test custom answer set
+        model = RAGBasedDecoderLLMArch(answer_set=self.test_answer_set)
+        self.assertEqual(model.ANSWER_SET, self.test_answer_set)
+
+    @patch("transformers.AutoTokenizer.from_pretrained")
+    @patch("transformers.AutoModelForCausalLM.from_pretrained")
+    def test_load(self, mock_model_class, mock_tokenizer_class):
+        """Test model loading and token ID initialization"""
+        # Set up mocks
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.encode.return_value = [0, 1]
+        mock_tokenizer_class.return_value = mock_tokenizer
+
+        mock_model = MagicMock()
+        mock_model_class.return_value = mock_model
+
+        # Set up model attributes
+        self.model.model = AutoModelForCausalLM
+        self.model.tokenizer = AutoTokenizer
+        self.model.kwargs = {
+            "device": "cpu",
+            "huggingface_access_token": "dummy_token",
+            "device_map": None,
+        }
+
+        self.model.load("dummy_path")
+
+        # Verify tokenizer and model were loaded
+        mock_tokenizer_class.assert_called_once_with("dummy_path")
+        mock_model_class.assert_called_once_with("dummy_path", token="dummy_token")
+
+    def test_check_answer_set_tokenizer(self):
+        """Test answer set tokenizer validation"""
+        # Mock tokenizer behavior
+        self.model.tokenizer = MagicMock()
+        # Mock encode to return exactly 2 tokens for valid input
+        self.model.tokenizer.encode.return_value = [0, 1]  # Mock valid tokenization
+        self.model.tokenizer.decode.side_effect = lambda x: (
+            "test" if x == [0, 1] else "other"
+        )
+
+        # Mock the answer set
+        self.model.ANSWER_SET = {"yes": ["test"], "no": ["test2"]}
+
+        # Test with valid input that should return exactly 2 tokens
+        self.model.tokenizer.input_ids = [0, 1]  # Mock input_ids property
+        self.model.tokenizer.return_value = MagicMock(
+            input_ids=[0, 1]
+        )  # Mock tokenizer call result
+        result = self.model.check_answer_set_tokenizer("test")
+        self.assertTrue(
+            result
+        )  # Should return True since tokenization returns exactly 2 tokens
+
+    @patch("torch.no_grad")
+    def test_generate_for_llm(self, mock_no_grad):
+        """Test LLM generation"""
+        self.model.model = MagicMock()
+        self.model.tokenizer = MagicMock()
+        self.model.kwargs = {"max_new_tokens": 10}
+
+        test_input = {"input_ids": torch.tensor([[1, 2, 3]])}
+        self.model.generate_for_llm(test_input)
+
+        # Verify model.generate was called
+        self.model.model.generate.assert_called_once()
+
+
+class TestRAGBasedOpenAILLMArch(unittest.TestCase):
+    def setUp(self):
+        self.model = RAGBasedOpenAILLMArch()
+        # Set up default answer set
+        self.model.ANSWER_SET = {
+            "yes": ["yes", "correct", "true"],
+            "no": ["no", "incorrect", "false"],
+        }
+
+    def test_initialization(self):
+        """Test basic initialization"""
+        self.assertEqual(str(self.model), "RAGBasedOpenAILLMArch")
+        self.assertIsNotNone(self.model.ANSWER_SET)
+        self.assertTrue(all(key in self.model.ANSWER_SET for key in ["yes", "no"]))
+        self.assertIsInstance(self.model.ANSWER_SET, dict)
+
+    def test_post_processor(self):
+        """Test post-processing of generated texts"""
+        # Create mock OpenAI response objects
+        mock_response1 = MagicMock()
+        mock_response1.choices = [MagicMock()]
+        mock_response1.choices[0].message.content = "Yes, that's correct."
+
+        mock_response2 = MagicMock()
+        mock_response2.choices = [MagicMock()]
+        mock_response2.choices[0].message.content = "No, that's wrong."
+
+        test_responses = [mock_response1, mock_response2]
+        results = self.model.post_processor(test_responses)
+
+        self.assertIsInstance(results, list)
+        self.assertEqual(len(results), len(test_responses))
+
+
+class TestRAG(unittest.TestCase):
+    @patch("ontoaligner.aligner.rag.rag.RAGBasedDecoderLLMArch")
+    def setUp(self, mock_llm_class):
+        # Create mock configs
+        self.retriever_config = {"param1": "value1"}
+        self.llm_config = {"param2": "value2"}
+
+        # Create mock Retrieval and LLM instances
+        self.mock_retrieval = MagicMock()
+        self.mock_llm = MagicMock()
+
+        # Set up the mock LLM class
+        mock_llm_class.return_value = self.mock_llm
+
+        # Create a mock for the base class initialization
+        with patch("ontoaligner.base.model.BaseOMModel.__init__") as mock_base_init:
+            mock_base_init.return_value = None
+            # Initialize RAG with mock configs and components
+            with patch.object(RAG, "Retrieval", return_value=self.mock_retrieval):
+                with patch.object(RAG, "LLM", return_value=self.mock_llm):
+                    self.rag = RAG(
+                        retriever_config=self.retriever_config,
+                        llm_config=self.llm_config,
+                    )
+                    # Set up the kwargs manually since we mocked the base init
+                    self.rag.kwargs = {
+                        "retriever-config": self.retriever_config,
+                        "llm-config": self.llm_config,
+                    }
+
+
+class TestAutoModelDecoderRAGLLM(unittest.TestCase):
+    def setUp(self):
+        self.model = AutoModelDecoderRAGLLM()
+
+    def test_initialization(self):
+        """Test initialization and attributes"""
+        self.assertEqual(
+            str(self.model), "RAGBasedDecoderLLMArch-AutoModel"
+        )  # Matches the implementation's __str__
+        self.assertEqual(self.model.tokenizer, AutoTokenizer)
+        self.assertEqual(self.model.model, AutoModelForCausalLM)
+
+
+class TestAutoModelDecoderRAGLLMV2(unittest.TestCase):
+    def setUp(self):
+        self.model = AutoModelDecoderRAGLLMV2()
+
+    def test_initialization(self):
+        """Test initialization and attributes"""
+        self.assertEqual(
+            str(self.model), "RAGBasedDecoderLLMArch-AutoModelV2"
+        )  # Matches the implementation's __str__
+        self.assertEqual(self.model.tokenizer, AutoTokenizer)
+        self.assertEqual(self.model.model, AutoModelForCausalLM)
+
+    @patch("torch.no_grad")
+    def test_get_probas_yes_no(self, mock_no_grad):
+        """Test probability calculation for yes/no answers"""
+        # Mock outputs with scores
+        mock_outputs = MagicMock()
+        mock_outputs.scores = [torch.rand(1, 10)]  # Random scores for testing
+
+        self.model.answer_sets_token_id = {"yes": [1, 2, 3], "no": [4, 5, 6]}
+
+        probas = self.model.get_probas_yes_no(mock_outputs)
+        self.assertIsInstance(probas, torch.Tensor)
+
+
+class TestOpenAIRAGLLM(unittest.TestCase):
+    def setUp(self):
+        self.model = OpenAIRAGLLM()
+
+    def test_initialization(self):
+        """Test initialization and string representation"""
+        self.assertEqual(str(self.model), "RAGBasedOpenAILLMArch-OpenAILLM")
+
+
+class TestMambaSSMRAGLLM(unittest.TestCase):
+    def setUp(self):
+        self.model = MambaSSMRAGLLM()
+
+    def test_initialization(self):
+        """Test initialization and string representation"""
+        self.assertEqual(str(self.model), "RAGBasedDecoderLLMArch-AutoModelV2-MambaSSM")
+        self.assertEqual(self.model.tokenizer, AutoTokenizer)
+        self.assertEqual(self.model.model, AutoModelForCausalLM)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..c6fefc5
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,21 @@
+import pytest
+from pathlib import Path
+from ontoaligner.ontology import GenericOntology
+
+
+@pytest.fixture
+def test_data_dir():
+    return Path(__file__).parent / "data"
+
+
+@pytest.fixture
+def sample_ontology(test_data_dir):
+    ontology_path = test_data_dir / "test-case1.owl"
+    return GenericOntology().load_ontology(str(ontology_path))
+
+
+@pytest.fixture
+def temp_output_dir(tmp_path):
+    output_dir = tmp_path / "test_output"
+    output_dir.mkdir(exist_ok=True)
+    return output_dir
diff --git a/tests/label_mapper/test_label_mapper.py b/tests/label_mapper/test_label_mapper.py
new file mode 100644
index 0000000..429645f
--- /dev/null
+++ b/tests/label_mapper/test_label_mapper.py
@@ -0,0 +1,113 @@
+import unittest
+from sklearn.linear_model import LogisticRegression
+from ontoaligner.postprocess.label_mapper import (
+    LabelMapper,
+    TFIDFLabelMapper,
+    SBERTLabelMapper,
+)
+
+
+class TestLabelMapper(unittest.TestCase):
+    def setUp(self):
+        """Set up test fixtures before each test method."""
+        self.default_label_dict = {
+            "yes": ["yes", "correct", "true"],
+            "no": ["no", "incorrect", "false"],
+        }
+        self.custom_label_dict = {
+            "match": ["match", "equivalent", "same"],
+            "no_match": ["no match", "different", "distinct"],
+        }
+
+    def test_label_mapper_initialization(self):
+        """Test the initialization of the base LabelMapper class."""
+        # Test with default label dictionary
+        mapper = LabelMapper()
+        self.assertEqual(mapper.labels, ["yes", "no"])
+        self.assertEqual(len(mapper.x_train), len(mapper.y_train))
+
+        # Test with custom label dictionary
+        mapper = LabelMapper(label_dict=self.custom_label_dict)
+        self.assertEqual(mapper.labels, ["match", "no_match"])
+        self.assertEqual(len(mapper.x_train), len(mapper.y_train))
+
+    def test_label_mapper_validation(self):
+        """Test the validation of predictions."""
+        mapper = LabelMapper()
+        # Test with valid predictions
+        valid_preds = ["yes", "no"]
+        mapper.validate_predicts(valid_preds)  # Should not raise any exception
+
+        # Test with invalid predictions
+        invalid_preds = ["yes", "maybe", "no"]
+        with self.assertRaises(AssertionError):
+            mapper.validate_predicts(invalid_preds)
+
+
+class TestTFIDFLabelMapper(unittest.TestCase):
+    def setUp(self):
+        """Set up test fixtures before each test method."""
+        self.label_dict = {
+            "yes": ["yes", "correct", "true"],
+            "no": ["no", "incorrect", "false"],
+        }
+        self.classifier = LogisticRegression()
+        self.mapper = TFIDFLabelMapper(
+            classifier=self.classifier, ngram_range=(1, 1), label_dict=self.label_dict
+        )
+
+    def test_tfidf_mapper_initialization(self):
+        """Test the initialization of TFIDFLabelMapper."""
+        self.assertIsNotNone(self.mapper.model)
+        self.assertEqual(self.mapper.labels, ["yes", "no"])
+
+    def test_tfidf_mapper_fit_predict(self):
+        """Test the fit and predict methods of TFIDFLabelMapper."""
+        # Fit the mapper
+        self.mapper.fit()
+
+        # Test predictions
+        test_inputs = ["yes", "correct", "no", "false", "maybe"]
+        predictions = self.mapper.predict(test_inputs)
+
+        # Verify predictions
+        self.assertEqual(len(predictions), len(test_inputs))
+        for pred in predictions:
+            self.assertIn(pred.lower(), ["yes", "no"])
+
+
+class TestSBERTLabelMapper(unittest.TestCase):
+    def setUp(self):
+        """Set up test fixtures before each test method."""
+        self.label_dict = {
+            "yes": ["yes", "correct", "true"],
+            "no": ["no", "incorrect", "false"],
+        }
+        self.model_id = "all-MiniLM-L12-v2"  # Using a small model for testing
+        self.mapper = SBERTLabelMapper(
+            model_id=self.model_id, label_dict=self.label_dict
+        )
+
+    def test_sbert_mapper_initialization(self):
+        """Test the initialization of SBERTLabelMapper."""
+        self.assertIsNotNone(self.mapper.embedder)
+        self.assertIsNotNone(self.mapper.classifier)
+        self.assertEqual(self.mapper.labels, ["yes", "no"])
+
+    def test_sbert_mapper_fit_predict(self):
+        """Test the fit and predict methods of SBERTLabelMapper."""
+        # Fit the mapper
+        self.mapper.fit()
+
+        # Test predictions
+        test_inputs = ["yes", "correct", "no", "false", "maybe"]
+        predictions = self.mapper.predict(test_inputs)
+
+        # Verify predictions
+        self.assertEqual(len(predictions), len(test_inputs))
+        for pred in predictions:
+            self.assertIn(pred.lower(), ["yes", "no"])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/ontologies/test_generic_ontology.py b/tests/ontologies/test_generic_ontology.py
new file mode 100644
index 0000000..d399095
--- /dev/null
+++ b/tests/ontologies/test_generic_ontology.py
@@ -0,0 +1,120 @@
+import unittest
+import os
+from rdflib import URIRef, RDFS, OWL, RDF, Graph
+from ontoaligner.ontology.generic import GenericOntology
+import ontoaligner
+
+
+class TestGenericOntology(unittest.TestCase):
+    def setUp(self):
+        self.ontology = GenericOntology()
+        self.test_graph = Graph()
+
+        # Load test-case1.owl
+        test_file_path = os.path.join(
+            os.path.dirname(__file__), "..", "data", "test-case1.owl"
+        )
+        self.test_graph.parse(test_file_path)
+
+        # Define URIs from test-case1.owl
+        self.animal_uri = URIRef("http://example.org/Animal")
+        self.mammal_uri = URIRef("http://example.org/Mammal")
+        self.dog_uri = URIRef("http://example.org/Dog")
+
+        # Add explicit owl:Class type to Animal since it's the root
+        self.test_graph.add((self.animal_uri, RDF.type, OWL.Class))
+
+        self.ontology.graph = self.test_graph
+
+    def test_get_label(self):
+        """Test label extraction with different scenarios"""
+        # Test with existing label
+        self.assertEqual(self.ontology.get_label(str(self.mammal_uri)), "Mammal")
+        self.assertEqual(self.ontology.get_label(str(self.dog_uri)), "Dog")
+
+        # Test with non-existent class
+        self.assertEqual(
+            self.ontology.get_label("http://example.org/NonExistent"),
"NonExistent", # The method returns the last part of the URI if no label found + ) + + # Test with URI fragment + self.assertEqual( + self.ontology.get_label("http://example.org/TestClass#Fragment"), "Fragment" + ) + + # Test with URI path + self.assertEqual( + self.ontology.get_label("http://example.org/path/LastPart"), "LastPart" + ) + + def test_get_synonyms(self): + """Test synonym extraction""" + # No synonyms in test-case1.owl, should return empty list + synonyms = self.ontology.get_synonyms(self.mammal_uri) + self.assertEqual(len(synonyms), 0) + + def test_get_parents(self): + """Test parent class extraction""" + # Test Dog's parent (Mammal) - Dog has subClassOf relationship + parents = self.ontology.get_parents(self.dog_uri) + self.assertEqual(len(parents), 1) + self.assertEqual(parents[0]["iri"], str(self.mammal_uri)) + self.assertEqual(parents[0]["label"], "Mammal") + + def test_get_childrens(self): + """Test children class extraction""" + # Test Animal's children (Mammal) + children = self.ontology.get_childrens(self.animal_uri) + self.assertEqual(len(children), 1) + self.assertEqual(children[0]["label"], "Mammal") + + # Test Mammal's children (Dog) + children = self.ontology.get_childrens(self.mammal_uri) + self.assertEqual(len(children), 1) + self.assertEqual(children[0]["label"], "Dog") + + def test_get_comments(self): + """Test comment extraction""" + # No comments in test-case1.owl, should return empty list + comments = self.ontology.get_comments(self.mammal_uri) + self.assertEqual(len(comments), 0) + + def test_get_class_info(self): + """Test complete class info extraction""" + # Test Dog class info - Dog has subClassOf relationship + class_info = self.ontology.get_class_info(self.dog_uri) + self.assertIsNotNone(class_info) + self.assertEqual(class_info["label"], "Dog") + self.assertEqual(len(class_info["childrens"]), 0) # No children + self.assertEqual(len(class_info["parents"]), 1) # Mammal + self.assertEqual(len(class_info["synonyms"]), 0) # No synonyms + self.assertEqual(len(class_info["comment"]), 0) # No comments + + def test_generic_ontology_parser(self): + """Test that the parse function loads an ontology correctly.""" + ontology = ontoaligner.ontology.GenericOntology() + ontology_path = os.path.join( + os.path.dirname(__file__), "..", "data/test-case1.owl" + ) + data = ontology.parse(ontology_path) + + # Ensure parsed ontology data is not empty + self.assertGreater(len(data), 0) + + # Check expected subclass relationships + mammal = URIRef("http://example.org/Mammal") + animal = URIRef("http://example.org/Animal") + self.assertTrue((mammal, RDFS.subClassOf, animal) in ontology.graph) + + # Check expected labels + label_predicate = RDFS.label + expected_label = "Mammal" + found_labels = [ + str(o) for s, p, o in ontology.graph if s == mammal and p == label_predicate + ] + self.assertIn(expected_label, found_labels) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/ontologies/test_ontology.py b/tests/ontologies/test_ontology.py new file mode 100644 index 0000000..0a8353c --- /dev/null +++ b/tests/ontologies/test_ontology.py @@ -0,0 +1,14 @@ +import pytest +from ontoaligner.ontology import GenericOntology +from rdflib import Graph + + +def test_ontology_loading(sample_ontology): + """Test that ontology can be loaded successfully.""" + assert sample_ontology is not None + assert isinstance(sample_ontology, Graph) + +def test_invalid_ontology_path(): + """Test that loading invalid ontology path raises error.""" + with pytest.raises(Exception): + 
+        GenericOntology().load_ontology("nonexistent.owl")
diff --git a/tests/ontologies/test_ontology_advanced.py b/tests/ontologies/test_ontology_advanced.py
new file mode 100644
index 0000000..d3b7e8d
--- /dev/null
+++ b/tests/ontologies/test_ontology_advanced.py
@@ -0,0 +1,117 @@
+import pytest
+from rdflib import Graph, Literal, Namespace
+from rdflib.namespace import RDF, RDFS, OWL
+from ontoaligner.ontology import GenericOntology
+
+
+@pytest.fixture
+def large_ontology():
+    """Create a large test ontology with many classes and relationships."""
+    g = Graph()
+    ns = Namespace("http://example.org/")
+
+    # Create 100 classes with relationships
+    for i in range(100):
+        class_uri = ns[f"Class_{i}"]
+        g.add((class_uri, RDF.type, OWL.Class))
+        g.add((class_uri, RDFS.label, Literal(f"Test Class {i}")))
+
+        # Add subclass relationships
+        if i > 0:
+            g.add((class_uri, RDFS.subClassOf, ns[f"Class_{i-1}"]))
+
+        # Add some comments and synonyms
+        g.add((class_uri, RDFS.comment, Literal(f"This is test class number {i}")))
+        g.add((class_uri, RDFS.label, Literal(f"Alternative Name {i}")))
+
+    return g
+
+def test_invalid_ontology_format():
+    """Test handling of invalid ontology formats."""
+    with pytest.raises(Exception):
+        ontology = GenericOntology()
+        ontology.parse("nonexistent.owl")
+
+
+def test_ontology_modification():
+    """Test modifying ontology data after loading."""
+    ontology = GenericOntology()
+
+    # Create a simple test graph
+    g = Graph()
+    ns = Namespace("http://example.org/")
+    class_uri = ns["TestClass"]
+    g.add((class_uri, RDF.type, OWL.Class))
+    g.add((class_uri, RDFS.label, Literal("Test Class")))
+
+    # Add the graph to the ontology
+    ontology.graph = g
+    data = ontology.extract_data(g)
+
+    # Verify initial state
+    assert len(data) == 1
+    assert data[0]["label"] == "Test Class"
+
+    # Modify the graph
+    g.add((class_uri, RDFS.comment, Literal("New comment")))
+
+    # Re-extract data and verify changes
+    updated_data = ontology.extract_data(g)
+    assert len(updated_data) == 1
+    assert "New comment" in updated_data[0]["comment"]
+
+
+def test_special_characters():
+    """Test handling of special characters in ontology data."""
+    ontology = GenericOntology()
+
+    # Create a test graph with special characters
+    g = Graph()
+    ns = Namespace("http://example.org/")
+    special_chars = ["é", "ñ", "ß", "漢", "🌟"]
+
+    for i, char in enumerate(special_chars):
+        class_uri = ns[f"Class_{i}"]
+        g.add((class_uri, RDF.type, OWL.Class))
+        g.add((class_uri, RDFS.label, Literal(f"Test Class {char}")))
+
+    # Parse and verify
+    ontology.graph = g
+    data = ontology.extract_data(g)
+
+    assert len(data) == len(special_chars)
+    for i, char in enumerate(special_chars):
+        assert f"Test Class {char}" in [item["label"] for item in data]
+
+
+def test_circular_references():
+    """Test handling of circular references in ontology."""
+    ontology = GenericOntology()
+
+    # Create a graph with circular references
+    g = Graph()
+    ns = Namespace("http://example.org/")
+
+    # Create circular subclass relationship
+    class_a = ns["ClassA"]
+    class_b = ns["ClassB"]
+    class_c = ns["ClassC"]
+
+    for class_uri in [class_a, class_b, class_c]:
+        g.add((class_uri, RDF.type, OWL.Class))
+        g.add((class_uri, RDFS.label, Literal(str(class_uri))))
+
+    # Create circular reference: A -> B -> C -> A
+    g.add((class_a, RDFS.subClassOf, class_b))
+    g.add((class_b, RDFS.subClassOf, class_c))
+    g.add((class_c, RDFS.subClassOf, class_a))
+
+    # Should handle circular references without infinite recursion
+    ontology.graph = g
+    data = ontology.extract_data(g)
+
+    assert len(data) == 3
+    # Verify each class has both parent and child relationships
+    for item in data:
+        assert len(item["parents"]) > 0
+        assert len(item["childrens"]) > 0
diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
new file mode 100644
index 0000000..476ff83
--- /dev/null
+++ b/tests/pipeline/test_pipeline.py
@@ -0,0 +1,67 @@
+import pytest
+from ontoaligner.pipeline import OntoAlignerPipeline
+from ontoaligner.ontology import GenericOMDataset
+
+
+def test_pipeline_initialization():
+    """Test that Pipeline can be initialized."""
+    pipeline = OntoAlignerPipeline(
+        task_class=GenericOMDataset,
+        source_ontology_path="tests/data/test-case1.owl",
+        target_ontology_path="tests/data/test-case1.owl",
+        reference_matching_path=None,
+    )
+    assert pipeline is not None
+
+
+def test_pipeline_with_lightweight_aligner(sample_ontology, temp_output_dir):
+    """Test pipeline with lightweight aligner."""
+    pipeline = OntoAlignerPipeline(
+        task_class=GenericOMDataset,
+        source_ontology_path="tests/data/test-case1.owl",
+        target_ontology_path="tests/data/test-case1.owl",
+        reference_matching_path=None,
+        output_dir=str(temp_output_dir),
+    )
+
+    result = pipeline(
+        method="lightweight",
+        fuzzy_sm_threshold=0.5,
+        save_matchings=True,
+        output_file_name="alignment_result",
+    )
+
+    assert result is not None
+
+
+@pytest.mark.skip(reason="Requires OpenAI API key")
+def test_pipeline_with_rag_aligner(sample_ontology, temp_output_dir):
+    """Test pipeline with RAG aligner."""
+    pipeline = OntoAlignerPipeline(
+        task_class=GenericOMDataset,
+        source_ontology_path="tests/data/test-case1.owl",
+        target_ontology_path="tests/data/test-case1.owl",
+        reference_matching_path=None,
+        output_dir=str(temp_output_dir),
+    )
+
+    result = pipeline(
+        method="rag", save_matchings=True, output_file_name="rag_alignment_result"
+    )
+
+    assert result is not None
+    output_path = temp_output_dir / "rag_alignment_result.xml"
+    assert output_path.exists()
+
+
+def test_pipeline_with_invalid_aligner():
+    """Test pipeline with invalid aligner type."""
+    pipeline = OntoAlignerPipeline(
+        task_class=GenericOMDataset,
+        source_ontology_path="tests/data/test-case1.owl",
+        target_ontology_path="tests/data/test-case1.owl",
+        reference_matching_path=None,
+    )
+
+    with pytest.raises(ValueError):
+        pipeline(method="invalid_aligner")
diff --git a/tests/pipeline/test_pipeline_advanced.py b/tests/pipeline/test_pipeline_advanced.py
new file mode 100644
index 0000000..12372e4
--- /dev/null
+++ b/tests/pipeline/test_pipeline_advanced.py
@@ -0,0 +1,124 @@
+import pytest
+import shutil
+from pathlib import Path
+from ontoaligner.pipeline import OntoAlignerPipeline
+from ontoaligner.ontology import GenericOMDataset
+import sys
+import gc
+
+
+@pytest.fixture
+def complex_pipeline():
+    """Create a pipeline with complex configuration."""
+    return OntoAlignerPipeline(
+        task_class=GenericOMDataset,
+        source_ontology_path="tests/data/test-case1.owl",
+        target_ontology_path="tests/data/test-case1.owl",
+        reference_matching_path=None,
+        output_dir="test_output",
+        output_format="xml"
+    )
+
+
+def test_pipeline_error_recovery(complex_pipeline):
+    """Test pipeline's error recovery capabilities."""
+    # Test with invalid method
+    with pytest.raises(ValueError):
+        complex_pipeline(method="invalid_method")
+
+    # Test with invalid threshold
+    with pytest.raises(ValueError):
+        complex_pipeline(method="lightweight", fuzzy_sm_threshold=2.0)
+
+    # Test with missing output directory
+    shutil.rmtree("test_output", ignore_errors=True)
+    result = complex_pipeline(
+        method="lightweight", save_matchings=True, output_file_name="test"
+    )
+    assert result is not None
+    assert Path("test_output/lightweight/test.xml").exists()
+
+def get_total_size(objects):
+    """Estimate total size of objects in memory."""
+    seen = set()
+    size = 0
+    for obj in objects:
+        if id(obj) not in seen:
+            seen.add(id(obj))
+            try:
+                size += sys.getsizeof(obj)
+            except TypeError:
+                pass  # Some built-in objects don't support getsizeof
+    return size
+
+def test_pipeline_resource_cleanup(complex_pipeline):
+    """Test for potential memory leaks without external libraries."""
+    gc.collect()
+    initial_objects = gc.get_objects()
+    initial_size = get_total_size(initial_objects)
+
+    for i in range(5):
+        result = complex_pipeline(
+            method="lightweight",
+            fuzzy_sm_threshold=0.5,
+            save_matchings=True,
+            output_file_name=f"cleanup_test_{i}",
+        )
+        assert result is not None
+
+    gc.collect()
+    final_objects = gc.get_objects()
+    final_size = get_total_size(final_objects)
+
+    # Report the growth; the assertion below bounds it
+    print(f"Initial size: {initial_size / 1024:.2f} KB")
+    print(f"Final size: {final_size / 1024:.2f} KB")
+    print(f"Size difference: {(final_size - initial_size) / 1024:.2f} KB")
+
+    # Allow some leeway (e.g. 10 MB) for object growth
+    assert (final_size - initial_size) < 10 * 1024 * 1024  # 10 MB in bytes
+
+
+def test_pipeline_large_ontology(tmp_path):
+    """Test pipeline with large ontologies."""
+    # Create large test ontologies
+    source_onto = create_large_ontology(tmp_path / "source_large.owl", 1000)
+    target_onto = create_large_ontology(tmp_path / "target_large.owl", 1000)
+
+    pipeline = OntoAlignerPipeline(
+        task_class=GenericOMDataset,
+        source_ontology_path=str(source_onto),
+        target_ontology_path=str(target_onto),
+        reference_matching_path=None,
+        output_dir=str(tmp_path / "output"),
+    )
+
+    result = pipeline(
+        method="lightweight",
+        fuzzy_sm_threshold=0.5,
+        save_matchings=True,
+        output_file_name="large_test",
+    )
+
+    assert result is not None
+    assert len(result) > 0  # Should have some matches
+
+
+def create_large_ontology(file_path, num_classes):
+    """Helper function to create large test ontologies."""
+    from rdflib import Graph, Literal, Namespace
+    from rdflib.namespace import RDF, RDFS, OWL
+
+    g = Graph()
+    ns = Namespace("http://example.org/")
+
+    for i in range(num_classes):
+        class_uri = ns[f"Class_{i}"]
+        g.add((class_uri, RDF.type, OWL.Class))
+        g.add((class_uri, RDFS.label, Literal(f"Test Class {i}")))
+
+        if i > 0:
+            g.add((class_uri, RDFS.subClassOf, ns[f"Class_{i-1}"]))
+
+    g.serialize(destination=str(file_path), format="xml")
+    return file_path
diff --git a/tests/test_ontology.py b/tests/test_ontology.py
deleted file mode 100644
index 2dc8804..0000000
--- a/tests/test_ontology.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import unittest
-import os
-import ontoaligner
-from rdflib import URIRef, RDFS
-
-class TestOntology(unittest.TestCase):
-
-    def test_generic_ontology_parser(self):
-        """Test that the parse function loads an ontology correctly."""
-        ontology = ontoaligner.ontology.GenericOntology()
-        ontology_path = os.path.join(os.path.dirname(__file__), "data/test-case1.owl")
-        data = ontology.parse(ontology_path)
-
-        # Ensure parsed ontology data is not empty
-        self.assertGreater(len(data), 0)
-
-        # Check expected subclass relationships
-        mammal = URIRef("http://example.org/Mammal")
-        animal = URIRef("http://example.org/Animal")
-        self.assertTrue((mammal, RDFS.subClassOf, animal) in ontology.graph)
-
-        # Check expected labels
-        label_predicate = RDFS.label
-        expected_label = "Mammal"
-        found_labels = [str(o) for s, p, o in ontology.graph if s == mammal and p == label_predicate]
-        self.assertIn(expected_label, found_labels)
-
-
-if __name__ == '__main__':
-    unittest.main()