
Commit 672f39d

add kge-retrieval & MacOS install issue #67

2 parents f6c1570 + dc7dfe2

File tree: 8 files changed, +225 −12 lines

build.sh

Lines changed: 0 additions & 2 deletions
This file was deleted.

docs/source/aligner/kge.rst

Lines changed: 76 additions & 0 deletions
@@ -4,6 +4,11 @@ Knowledge Graph Embedding
 Graph Embeddings
 ---------------------------------
 
+.. sidebar:: **Reference:**
+
+    `OntoAligner Meets Knowledge Graph Embedding Aligners <https://arxiv.org/abs/2509.26417>`_
+
+
 Ontology alignment involves finding correspondences between entities in different ontologies. OntoAligner addresses this challenge by leveraging **Knowledge Graph Embedding (KGE)** models. The core idea of KGE is to represent entities (like classes, properties, individuals) and relations within an ontology as **low-dimensional vectors** in a continuous vector space. These numerical representations (embeddings) are learned to preserve semantic relationships from the original ontology geometrically in the embedding space.
 
 .. hint::
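
As an aside to the paragraph above: the geometric comparison amounts to placing source and target entities in the same vector space and scoring pairs with a similarity function. A minimal illustrative sketch (toy tensors and sizes, not part of this commit) using the same dot-product similarity the aligner's code relies on:

    import torch

    torch.manual_seed(0)
    source_embeddings = torch.randn(3, 8)   # 3 source entities, 8-dim embeddings (toy sizes)
    target_embeddings = torch.randn(4, 8)   # 4 target entities

    # Entry [i, j] scores source entity i against target entity j; shape (3, 4)
    similarity_matrix = torch.matmul(source_embeddings, target_embeddings.T)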
@@ -263,3 +268,74 @@ Here ``RESCAL`` is our custom KGE model.
 .. note::
 
     For possible models please take a look at `PyKEEN > Models <https://pykeen.readthedocs.io/en/latest/reference/models.html#classes>`_.
+
+KGE Retriever
+----------------------
+
+.. sidebar:: Key Parameters:
+
+    - ``retriever``: boolean
+    - ``top_k``: integer
+
+In addition to one-to-one alignments, OntoAligner also supports retriever-based alignment. When retriever mode is enabled (``retriever=True``), the aligner returns the top-k candidate target entities for each source entity, along with their similarity scores (similar to the retriever aligner). This mode is useful if you want to build downstream candidate-filtering pipelines, apply human-in-the-loop validation, or integrate with reranking modules (e.g., LLMs or supervised classifiers).
+
+Here is an example of how to use a KGE aligner as a retriever model:
+
+.. code-block:: python
+
+    from ontoaligner.aligner import TransEAligner
+
+    # Enable retriever mode and request top-3 candidates per source entity
+    aligner = TransEAligner(retriever=True, top_k=3)
+
+    matchings = aligner.generate(input_data=encoded_dataset)
+
+.. list-table::
+    :widths: 20 80
+    :header-rows: 1
+
+    * - Mode
+      - Description
+
+    * - **KGE Default mode**
+      - The default for KGE aligners is ``retriever=False``, which produces **one-to-one** alignments: each source entity is matched to the single most similar target entity.
+    * - **KGE Retriever mode**
+      - Setting ``retriever=True`` produces **one-to-many** alignments: each source entity is matched to multiple target entities. Example output:
+
+
+.. tab:: ➡️ KGE Retriever Mode Example output
+
+    ::
+
+        [
+            {
+                "source": "http://mouse.owl#MA_0000143",
+                "target-cands": [
+                    "http://human.owl#HBA_0000214",
+                    "http://human.owl#HBA_0000762",
+                    "http://human.owl#HBA_0000891"
+                ],
+                "score-cands": [0.87, 0.82, 0.77]
+            },
+            ...
+        ]
+
+
+.. tab:: ➡️ KGE Default Mode Example output
+
+    ::
+
+        {
+            'source': 'http://mouse.owl#MA_0000143',
+            'target': 'http://human.owl#HBA_0000214',
+            'score': 0.87
+        }
+
+
+.. note::
+
+    Consider reading the following section next:
+
+    * `Package Reference > Aligners <../package_reference/aligners.html>`_
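
For the downstream candidate-filtering pipelines mentioned in the new section, a hedged sketch (the helper name and threshold are illustrative, not part of this commit) that trims retriever-mode output by score before handing candidates to a reranker:

    def filter_candidates(matchings, min_score=0.8):
        """Keep only candidate targets whose similarity score clears the threshold."""
        filtered = []
        for match in matchings:
            kept = [(target, score)
                    for target, score in zip(match["target-cands"], match["score-cands"])
                    if score >= min_score]
            filtered.append({
                "source": match["source"],
                "target-cands": [target for target, _ in kept],
                "score-cands": [score for _, score in kept],
            })
        return filtered

The input structure mirrors the KGE Retriever Mode example output shown above.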

docs/source/package_reference/aligners.rst

Lines changed: 1 addition & 1 deletion
@@ -63,4 +63,4 @@ KGE Aligners
     :members:
     :undoc-members:
     :show-inheritance:
-    :special-members:
+    :special-members: __init__, generate

ontoaligner/aligner/graph/graph.py

Lines changed: 40 additions & 6 deletions
@@ -52,11 +52,13 @@ class GraphEmbeddingAligner(BaseOMModel):
 
     def __init__(self,
                  device: str='cpu',
+                 retriever: bool = False,
                  embedding_dim: int=300,
                  num_epochs: int=50,
                  train_batch_size: int=128,
                  eval_batch_size: int=64,
                  num_negs_per_pos: int=5,
+                 top_k: int=5,
                  random_seed: int=42):
         """
         Initializes the GraphEmbeddingAligner with training configuration.
@@ -71,11 +73,13 @@ def __init__(self,
             random_seed (int): Random seed for reproducibility.
         """
         super().__init__(device=device,
+                         retriever=retriever,
                          embedding_dim=embedding_dim,
                          num_epochs=num_epochs,
                          train_batch_size=train_batch_size,
                          eval_batch_size=eval_batch_size,
                          num_negs_per_pos=num_negs_per_pos,
+                         top_k=top_k,
                          random_seed=random_seed)
 
     def fit(self, triplets: List):
@@ -108,6 +112,7 @@ def fit(self, triplets: List):
     def _similarity_matrix(self, source_onto_tensor, target_onto_tensor):
         return torch.matmul(source_onto_tensor, target_onto_tensor.T)
 
+
     def predict(self, source_onto: Dict, target_onto: Dict):
         """
         Aligns entities from the source ontology to entities in the target ontology
@@ -134,16 +139,45 @@ def predict(self, source_onto: Dict, target_onto: Dict):
 
         similarity_matrix = self._similarity_matrix(source_onto_tensor, target_onto_tensor) # shape: (n1, n2)
 
-        best_scores, best_indices = similarity_matrix.max(dim=1)
+        if self.kwargs['retriever']:
+            matches = self._retriever_predict(similarity_matrix=similarity_matrix,
+                                              source_ent2iri=source_ent2iri,
+                                              target_ent2iri=target_ent2iri,
+                                              source_ents=source_ents,
+                                              target_ents=target_ents,
+                                              top_k=self.kwargs['top_k'])
+        else:
+            matches = self._predict(similarity_matrix=similarity_matrix,
+                                    source_ent2iri=source_ent2iri,
+                                    target_ent2iri=target_ent2iri,
+                                    source_ents=source_ents,
+                                    target_ents=target_ents)
+        return matches
 
-        matches = [
-            {
+    def _retriever_predict(self, similarity_matrix, source_ent2iri, target_ent2iri, source_ents, target_ents, top_k):
+        best_scores, best_indices = similarity_matrix.topk(k=top_k, dim=1)
+        matches = []
+        for i, src in enumerate(source_ents):
+            target_cands, score_cands = [], []
+            for j in range(top_k):
+                target_cands.append(target_ent2iri[target_ents[best_indices[i][j].item()]])
+                score_cands.append(best_scores[i][j].item())
+            matches.append({
+                "source": source_ent2iri[src],
+                "target-cands": target_cands,
+                "score-cands": score_cands
+            })
+        return matches
+
+    def _predict(self, similarity_matrix, source_ent2iri, target_ent2iri, source_ents, target_ents):
+        best_scores, best_indices = similarity_matrix.max(dim=1)
+        matches = []
+        for index in range(len(source_ents)):
+            matches.append({
                 "source": source_ent2iri[source_ents[index]],
                 "target": target_ent2iri[target_ents[best_indices[index].item()]],
                 "score": best_scores[index].item()
-            }
-            for index in range(len(source_ents))
-        ]
+            })
         return matches
 
     def get_embeddings(self):
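
A small illustrative sketch (toy matrix, not part of the commit) of the two reductions the new helpers build on, torch.Tensor.max for the default one-to-one mode and torch.Tensor.topk for retriever mode:

    import torch

    similarity_matrix = torch.tensor([[0.87, 0.82, 0.77],
                                      [0.10, 0.95, 0.30]])

    # Default mode (_predict): one best target per source row -> shapes (2,), (2,)
    best_scores, best_indices = similarity_matrix.max(dim=1)

    # Retriever mode (_retriever_predict): top-k targets per source row -> shapes (2, 2), (2, 2)
    topk_scores, topk_indices = similarity_matrix.topk(k=2, dim=1)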

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ openai = "1.56.0"
 rank_bm25 = "0.2.2"
 huggingface-hub = "^0.34.4"
 sentence-transformers = "^5.1.0"
-bitsandbytes = "^0.45.1"
+bitsandbytes = { version = ">=0.45.1,<1.0.0", markers = "platform_system == 'Linux'" }
 pykeen = "1.11.1"
 
 [tool.poetry.dev-dependencies]

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ openai==1.56.0
 rank_bm25==0.2.2
 huggingface-hub==0.34.4
 sentence-transformers==5.1.0
-bitsandbytes==0.45.1
+bitsandbytes>=0.45.1,<0.46.0; platform_system == "Linux"
 pykeen==1.11.1
 ruff
 pre-commit

setup.py

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@
         "torch>=2.8.0,<3.0.0",
         "transformers>=4.56.0,<5.0.0",
         "huggingface-hub>=0.34.4,<1.0.0",
-        "bitsandbytes>=0.45.1,<1.0.0",
+        "bitsandbytes>=0.45.1,<1.0.0; platform_system == 'Linux'",
         "pykeen==1.11.1"
     ],
     classifiers=[
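
With bitsandbytes restricted to Linux across all three dependency files, code that imports it would need to tolerate its absence on macOS. A hedged sketch of one way to guard the import (an assumption for illustration, not code from this commit):

    import importlib.util

    # bitsandbytes is only installed on Linux after this change
    if importlib.util.find_spec("bitsandbytes") is not None:
        import bitsandbytes as bnb
    else:
        bnb = None  # e.g. macOS: features that need bitsandbytes stay disabled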

tests/aligners/test_kge.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+import pytest
+from ontoaligner.aligner.graph import GraphEmbeddingAligner
+
+@pytest.fixture
+def toy_retriever_ontologies():
+    source_onto = {
+        "entity2iri": {
+            "s1": "http://source.org/1",
+            "s2": "http://source.org/2",
+        },
+        "triplets": [
+            ("s1", "relatedTo", "s2")
+        ],
+    }
+
+    target_onto = {
+        "entity2iri": {
+            "t1": "http://target.org/1",
+            "t2": "http://target.org/2",
+            "t3": "http://target.org/3",
+        },
+        "triplets": [
+            ("t1", "relatedTo", "t2"),
+            ("t2", "relatedTo", "t3"),
+        ],
+    }
+
+    return source_onto, target_onto
+
+
+def test_retriever_topk_output(toy_retriever_ontologies):
+    source_onto, target_onto = toy_retriever_ontologies
+
+    class DummyAligner(GraphEmbeddingAligner):
+        model = "TransE"  # keep it lightweight for test
+
+    # retriever=True → returns top-k candidates
+    aligner = DummyAligner(retriever=True, top_k=2, num_epochs=1, embedding_dim=16)
+
+    results = aligner.generate([source_onto, target_onto])
+
+    # Check output format
+    assert isinstance(results, list)
+    assert all("source" in match for match in results)
+    assert all("target-cands" in match for match in results)
+    assert all("score-cands" in match for match in results)
+
+    # Check top-k constraint
+    for match in results:
+        assert len(match["target-cands"]) == 2
+        assert len(match["score-cands"]) == 2
+        assert all(isinstance(score, float) for score in match["score-cands"])
+
+
+@pytest.fixture
+def toy_kge_ontologies():
+    source_onto = {
+        "entity2iri": {
+            "s1": "http://source.org/1",
+            "s2": "http://source.org/2",
+        },
+        "triplets": [
+            ("s1", "relatedTo", "s2")
+        ],
+    }
+
+    target_onto = {
+        "entity2iri": {
+            "t1": "http://target.org/1",
+            "t2": "http://target.org/2",
+        },
+        "triplets": [
+            ("t1", "relatedTo", "t2")
+        ],
+    }
+
+    return source_onto, target_onto
+
+
+def test_kge_aligner_output(toy_kge_ontologies):
+    source_onto, target_onto = toy_kge_ontologies
+
+    class DummyAligner(GraphEmbeddingAligner):
+        model = "TransE"  # keep it light for testing
+
+    # retriever=False → one-to-one mapping
+    aligner = DummyAligner(retriever=False, num_epochs=1, embedding_dim=16)
+
+    results = aligner.generate([source_onto, target_onto])
+
+    # Check output format
+    assert isinstance(results, list)
+    assert all(isinstance(match, dict) for match in results)
+
+    # Check required keys
+    for match in results:
+        assert "source" in match
+        assert "target" in match
+        assert "score" in match
+
+        # Check value types
+        assert isinstance(match["source"], str)
+        assert isinstance(match["target"], str)
+        assert isinstance(match["score"], float)
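
To exercise the new tests locally, one option (assuming pytest is installed, as the module itself requires) is to invoke pytest programmatically:

    import pytest

    # Run only the new KGE aligner tests; equivalent to `pytest tests/aligners/test_kge.py`
    pytest.main(["tests/aligners/test_kge.py"])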
