from collections import defaultdict, deque
import random

import pandas as pd
from tqdm import tqdm

from ..data_structure import SyntheticText2OntoData


class SyntheticDataSplitter:
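    """Split a SyntheticText2OntoData corpus into train/val/test sets.

    Types are propagated to splits through type/document co-occurrence so
    that related types stay together; `split` returns the per-split terms,
    types, documents, and type-to-document maps.
    """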

    def __init__(self, synthetic_data: SyntheticText2OntoData, onto_name: str):
        self.pseudo_sentence_batches = pd.DataFrame([ps.dict() for ps in synthetic_data.pseudo_sentences])
        self.child_to_parent = synthetic_data.child_to_parent

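        # Containers and index structures: the term/type <-> doc-id lookups
        # below drive the split propagation.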
        self.documents = list()
        self.term_to_doc_id = defaultdict(set)
        self.type_to_doc_id = defaultdict(set)
        self.doc_id_to_terms = defaultdict(set)
        self.doc_id_to_types = defaultdict(set)
        for row in tqdm(self.pseudo_sentence_batches.itertuples(index=False), total=len(self.pseudo_sentence_batches)):
            doc_id = str(row.id)
            self.doc_id_to_types[doc_id] = set(row.types)
            self.doc_id_to_terms[doc_id] = set(row.terms)
            for a_type in row.types:
                self.type_to_doc_id[a_type].add(doc_id)
            for a_term in row.terms:
                self.term_to_doc_id[a_term].add(doc_id)

        self.doc_id_to_doc = {doc.id: doc for doc in synthetic_data.generated_docs}
        print(f"loaded {len(self.doc_id_to_doc)} documents!")

        total_type_count = len(set().union(*self.doc_id_to_types.values()))
        total_term_count = len(set().union(*self.doc_id_to_terms.values()))
        print(f" total type count: {total_type_count}")
        print(f" total term count: {total_term_count}")

        self.onto_name = onto_name

    def set_train_val_test_sizes(self, train_percentage: float = 0.8,
                                 val_percentage: float = 0.1,
                                 test_percentage: float = 0.1):
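        """Compute per-split quotas over types and documents.

        Each quota is the floor of percentage * total; the test split
        absorbs the integer-rounding remainder.
        """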
        # Use a tolerance instead of exact float equality: sums such as
        # 0.7 + 0.2 + 0.1 do not evaluate to exactly 1.0.
        if abs(train_percentage + val_percentage + test_percentage - 1.0) > 1e-9:
            raise ValueError("The sum of train/val/test percentages should be 1.")
        total_types = len(self.type_to_doc_id)
        total_docs = len(self.doc_id_to_doc)
        train_quota = int(train_percentage * total_types)
        val_quota = int(val_percentage * total_types)
        test_quota = total_types - train_quota - val_quota
        print(f"train_quota: {train_quota}\nval_quota: {val_quota}\ntest_quota: {test_quota}")
        split_targets = {
            'train': train_quota,
            'val': val_quota,
            'test': test_quota
        }
        train_docs_quota = int(train_percentage * total_docs)
        val_docs_quota = int(val_percentage * total_docs)
        test_docs_quota = total_docs - train_docs_quota - val_docs_quota
        print(
            f"train docs quota: {train_docs_quota}\nval docs quota: {val_docs_quota}\ntest docs quota: {test_docs_quota}")
        split_docs_targets = {
            'train': train_docs_quota,
            'val': val_docs_quota,
            'test': test_docs_quota
        }
        return split_targets, split_docs_targets

    def assign_types_with_propagation(self, split_name, split_targets, split_docs_targets,
                                      split_types, split_docs, unassigned_types, unassigned_docs, assigned_docs):
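        """Grow one split by BFS over type/document co-occurrences.

        Assigning a seed type pulls in every document that mentions it and,
        from those documents, every co-occurring type, so related types land
        in the same split. Growth stops once the split's type or document
        quota is reached.
        """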
        target_size = split_targets[split_name]
        docs_target_size = split_docs_targets[split_name]
        while len(split_types[split_name]) < target_size and len(
                split_docs[split_name]) < docs_target_size and unassigned_types:
            # unassigned_types is an insertion-ordered dict (see
            # create_train_val_test_splits), so seeds come out in the
            # seeded-shuffle order rather than in arbitrary set order.
            type_seed = next(iter(unassigned_types))
            del unassigned_types[type_seed]
            queue = deque([type_seed])
            while (queue and len(split_types[split_name]) < target_size and
                   len(split_docs[split_name]) < docs_target_size):
                current_type = queue.popleft()
                if current_type in split_types['train'] | split_types['val'] | split_types['test']:
                    continue
                split_types[split_name].add(current_type)
                # Pull in all documents that mention this type.
                for doc_id in self.type_to_doc_id.get(current_type, []):
                    if doc_id in assigned_docs:
                        continue
                    split_docs[split_name].add(doc_id)
                    assigned_docs.add(doc_id)
                    unassigned_docs.discard(doc_id)
                    for t in self.doc_id_to_types[doc_id]:
                        if t not in split_types['train'] | split_types['val'] | split_types['test']:
                            queue.append(t)
                            unassigned_types.pop(t, None)
        return split_types, split_docs, unassigned_docs, assigned_docs

    def create_train_val_test_splits(self, split_targets, split_docs_targets):
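        """Partition all documents into train/val/test splits.

        Types are first assigned split by split via BFS propagation; any
        document left unassigned afterwards goes to the split whose types it
        overlaps the most (train, if it overlaps none).
        """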
        split_types = {'train': set(), 'val': set(), 'test': set()}
        split_docs = {'train': set(), 'val': set(), 'test': set()}
        all_types = list(self.type_to_doc_id.keys())
        random.seed(25)
        random.shuffle(all_types)
        # An insertion-ordered dict acts as an ordered set here; a plain set
        # would pop arbitrary elements and defeat the seeded shuffle above.
        unassigned_types = dict.fromkeys(all_types)
        unassigned_docs = set(self.doc_id_to_doc.keys())
        assigned_docs = set()

        for split_name in ['train', 'test', 'val']:
            split_types, split_docs, unassigned_docs, assigned_docs = self.assign_types_with_propagation(
                split_name, split_targets, split_docs_targets, split_types, split_docs,
                unassigned_types, unassigned_docs, assigned_docs)

        # Assign each remaining document to the split whose already-assigned
        # types it overlaps the most; documents overlapping none go to train.
        for doc_id in unassigned_docs.copy():
            doc_types = self.doc_id_to_types[doc_id]
            doc_type_split_counts = {"train": 0, "test": 0, "val": 0}
            for a_type in doc_types:
                for split_name in ['train', 'test', 'val']:
                    if a_type in split_types[split_name]:
                        doc_type_split_counts[split_name] += 1

            total = sum(doc_type_split_counts.values())
            if total == 0:
                split_docs["train"].add(doc_id)
            else:
                max_key = max(doc_type_split_counts, key=doc_type_split_counts.get)
                split_docs[max_key].add(doc_id)
            unassigned_docs.discard(doc_id)

        assert len(unassigned_docs) == 0, "Some documents were left unassigned."

        print(f"Train: {len(split_docs['train'])} docs, {len(split_types['train'])} types")
        print(f"Val: {len(split_docs['val'])} docs, {len(split_types['val'])} types")
        print(f"Test: {len(split_docs['test'])} docs, {len(split_types['test'])} types")
        return split_docs

    def generate_split_artefacts(self, split_docs):
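        """Materialise per-split artefacts: terms with their parent types,
        types with their parents, document objects, and a type-to-document
        map flagged as extractive or abstractive.
        """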
        split_terms = {'train': set(), 'val': set(), 'test': set()}
        terms_splits = {}
        for split_name in ['train', 'val', 'test']:
            for doc_id in split_docs[split_name]:
                split_terms[split_name].update(self.doc_id_to_terms[doc_id])
            split_terms[split_name] = list(split_terms[split_name])
            terms_with_types = [{"term": term, "types": self.child_to_parent.get(term, [])}
                                for term in split_terms[split_name]]
            terms_splits[split_name] = terms_with_types

        types_splits = {}
        for split_name in ['train', 'val', 'test']:
            split_types_from_docs = set()
            for doc_id in split_docs[split_name]:
                split_types_from_docs.update(self.doc_id_to_types[doc_id])
            types_with_parents = [{"type": a_type, "parents": self.child_to_parent.get(a_type, [])}
                                  for a_type in split_types_from_docs]
            types_splits[split_name] = types_with_parents

        docs_split = {'train': [], 'val': [], 'test': []}
        split_to_text = {'train': "", 'val': "", 'test': ""}
        for split_name in ['train', 'val', 'test']:
            # Collect the pieces and join once; += on strings in a loop is
            # quadratic in the total text length.
            text_parts = []
            for doc_id in split_docs[split_name]:
                doc = self.doc_id_to_doc[doc_id]
                docs_split[split_name].append(doc)
                text_parts.append(" " + doc.title + " " + doc.text)
            split_to_text[split_name] = "".join(text_parts)

        types2docs_splits = {}
        for split_name in ['train', 'val', 'test']:
            type2doc = defaultdict(list)
            split_types_from_docs = set()
            for doc_id in split_docs[split_name]:
                split_types_from_docs.update(self.doc_id_to_types[doc_id])
            for a_type in split_types_from_docs:
                for doc_id in self.type_to_doc_id[a_type]:
                    if doc_id in split_docs[split_name]:
                        # A type counts as "extractive" when its label occurs
                        # verbatim in the split's concatenated text; otherwise
                        # it is "abstractive".
                        extraction_type = "abstractive"
                        if a_type in split_to_text[split_name]:
                            extraction_type = "extractive"
                        type2doc[a_type].append({"doc_id": doc_id, "extraction_type": extraction_type})
            types2docs_splits[split_name] = type2doc

        return terms_splits, types_splits, docs_split, types2docs_splits

    def split(self, train: float = 0.8, val: float = 0.1, test: float = 0.1):
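        """Run the full pipeline and return (terms, types, docs, types2docs)."""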
        split_targets, split_docs_targets = self.set_train_val_test_sizes(train_percentage=train,
                                                                          val_percentage=val,
                                                                          test_percentage=test)
        split_docs = self.create_train_val_test_splits(split_targets, split_docs_targets)
        terms, types, docs, types2docs = self.generate_split_artefacts(split_docs)
        return terms, types, docs, types2docs
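
# Minimal usage sketch (illustrative only, hence commented out):
# `load_synthetic_data` and the "corpus.pkl" path are hypothetical
# placeholders for however the surrounding project actually builds a
# SyntheticText2OntoData instance.
#
#     data = load_synthetic_data("corpus.pkl")  # hypothetical loader
#     splitter = SyntheticDataSplitter(data, onto_name="example_onto")
#     terms, types, docs, types2docs = splitter.split(train=0.8, val=0.1, test=0.1)
#     print(len(docs["train"]), len(docs["val"]), len(docs["test"]))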