
Commit ef313aa

Merge pull request #221 from sciknoworg/dev
Added text2onto with minor changes to repo docs
2 parents 78b7bc5 + ea51d4f commit ef313aa

File tree

13 files changed, +532 -7 lines changed


MAINTANANCE.md

Whitespace-only changes.

README.md

Lines changed: 5 additions & 2 deletions
@@ -2,13 +2,16 @@
 <img src="https://raw.githubusercontent.com/sciknoworg/OntoLearner/main/images/logo.png" alt="OntoLearner Logo"/>
 </div>
 
-<h3 align="center">OntoLearner: Ontology Learning Framework</h3>
+<h3 align="center">OntoLearner: Benchmarking Ontology Learning Framework</h3>
 
 <div align="center">
 
+[![PyPI version](https://badge.fury.io/py/OntoLearner.svg)](https://badge.fury.io/py/OntoLearner)
+[![PyPI Downloads](https://static.pepy.tech/badge/ontolearner)](https://pepy.tech/projects/ontolearner)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
-
+[![Documentation Status](https://app.readthedocs.org/projects/ontolearner/badge/)](https://ontolearner.readthedocs.io/)
+[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](MAINTANANCE.md)
 </div>

docs/source/index.rst

Lines changed: 7 additions & 0 deletions
@@ -15,6 +15,13 @@
    benchmarking/benchmark
 
 
+.. toctree::
+   :maxdepth: 1
+   :caption: Text2Onto
+   :hidden:
+
+   text2onto/text2onto
+
 .. toctree::
    :maxdepth: 1
    :caption: Hugging Face Hub

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+Text2Onto
+==================================

ontolearner/data_structure/data.py

Lines changed: 15 additions & 0 deletions
@@ -82,3 +82,18 @@ class OntologyData(BaseModel):
     term_typings: List[TermTyping] = Field(..., description="List of term typing entries")
     type_taxonomies: TypeTaxonomies = Field(..., description="Taxonomy information")
     type_non_taxonomic_relations: NonTaxonomicRelations = Field(..., description="Non-taxonomic relation information")
+
+class PseudoSentence(BaseModel):
+    id: str
+    pseudo_sentences: List[str]
+    terms: List[str]
+    types: List[str]
+
+
+class SyntheticText2OntoData(BaseModel):
+    """
+    Schema for synthetic text2onto generator
+    """
+    child_to_parent: Dict[str, List[str]] = Field(..., description="Mapping from child terms to their parent terms")
+    pseudo_sentences: List[PseudoSentence] = Field(..., description="List of pseudo sentence batches with metadata")
+    generated_docs: List[Document] = Field(..., description="Generated documents from pseudo sentences")
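
For orientation, a minimal sketch (not part of the commit) of how the new models compose. The fields of Document (id, title, text) are an assumption inferred from how splitter.py uses them further down; the real model may require more.

from ontolearner.data_structure import Document, PseudoSentence, SyntheticText2OntoData

# One pseudo-sentence batch, as produced by the batchifier below.
batch = PseudoSentence(
    id="0_0",
    pseudo_sentences=["poodle is a dog", "dog is a animal"],
    terms=["poodle", "dog"],
    types=["dog", "animal"],
)

# Container consumed by SyntheticDataSplitter. Document fields (id, title, text)
# are assumed here from their use in splitter.py; they are not shown in this diff.
data = SyntheticText2OntoData(
    child_to_parent={"poodle": ["dog"], "dog": ["animal"]},
    pseudo_sentences=[batch],
    generated_docs=[Document(id="0_0", title="Dogs", text="A poodle is a breed of dog.")],
)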

ontolearner/text2onto/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1 +1,2 @@
-from .general import * # NOQA
+from .synthesizer import SyntheticGenerator
+from .splitter import SyntheticDataSplitter

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+from collections import defaultdict, deque
+from typing import Dict, List
+from abc import ABC
+
+from ..data_structure import PseudoSentence
+
+class TaxonomyBatchifier(ABC):
+    def __init__(self, parent_to_child: Dict[str, List], batch_size: int):
+        self.parent_to_child = parent_to_child
+        self.batch_size = batch_size
+        self.child_to_parents = self._build_child_to_parents()
+        self.visited_relation_ids = set()
+        self.relation_queue = deque()
+        self.raw_batches = []
+
+    def _build_child_to_parents(self):
+        child_to_parents = defaultdict(list)
+        for parent, children in self.parent_to_child.items():
+            for rel in children:
+                child_to_parents[rel[1]].append((parent, rel[0]))  # (parent label, relation ID)
+        return child_to_parents
+
+    def _find_leaf_nodes(self):
+        all_children = set()
+        for children in self.parent_to_child.values():
+            for child_info in children:
+                all_children.add(child_info[1])
+        return [child for child in all_children if child not in self.parent_to_child]
+
+    def _enqueue_parents(self, node):
+        for parent, rel_id in self.child_to_parents.get(node, []):
+            if rel_id not in self.visited_relation_ids:
+                self.relation_queue.append((parent, node, rel_id))
+
+    def _structure_aware_batching(self, bootstrap_nodes=None):
+        if bootstrap_nodes is None:
+            bootstrap_nodes = self._find_leaf_nodes()
+        elif len(bootstrap_nodes) == 0:
+            return
+
+        # Initialize queue from leaf nodes
+        for leaf in bootstrap_nodes:
+            for parent, rel_id in self.child_to_parents.get(leaf, []):
+                self.relation_queue.append((parent, leaf, rel_id))
+
+        while self.relation_queue:
+            parent, child, rel_id = self.relation_queue.popleft()
+            # if the relation is already included, skip it
+            if rel_id in self.visited_relation_ids:
+                continue
+
+            current_batch = []
+
+            # A: add children if any
+            for child_rel in self.parent_to_child.get(child, []):
+                child_id, grandchild, rel_label = child_rel
+                if child_id not in self.visited_relation_ids:
+                    current_batch.append((child, child_id, grandchild, rel_label))
+                    self.visited_relation_ids.add(child_id)
+                    self._enqueue_parents(child)
+
+            # A: Add all siblings
+            for sibling_rel in self.parent_to_child[parent]:
+                sibling_id, sibling, rel_label = sibling_rel
+                if sibling_id not in self.visited_relation_ids:
+                    current_batch.append((parent, sibling_id, sibling, rel_label))
+                    self.visited_relation_ids.add(sibling_id)
+
+                # B: Add children of each sibling
+                for child_rel in self.parent_to_child.get(sibling, []):
+                    child_id, grandchild, rel_label = child_rel
+                    if child_id not in self.visited_relation_ids:
+                        current_batch.append((sibling, child_id, grandchild, rel_label))
+                        self.visited_relation_ids.add(child_id)
+                        self._enqueue_parents(sibling)
+
+            if current_batch:
+                self.raw_batches.append(current_batch)
+
+        # C: Fallback for any missed relations
+        fallback_bootstrap_nodes = []
+        for parent, children in self.parent_to_child.items():
+            for rel in children:
+                if rel[0] not in self.visited_relation_ids:
+                    fallback_bootstrap_nodes.append(parent)
+                    fallback_bootstrap_nodes.append(rel[1])
+        self._structure_aware_batching(bootstrap_nodes=fallback_bootstrap_nodes)
+
+
+    def _split_batches_and_create_pseudo_sentences(self) -> List[PseudoSentence]:
+        verbalized_batches = []
+        id = 0
+        for batch in self.raw_batches:
+            verbalized_batch = []
+            # keeping track of pseudo sentences to avoid adding duplicates
+            pseudo_sentences = set()
+            for rel in batch:
+                pseudo_sentence = f"{rel[2]} {rel[3]} {rel[0]}"
+                # keeping track of terms and types included in each pseudo sentence
+                if rel[3] == "is a":
+                    terms = [rel[2]]
+                    types = [rel[0]]
+                else:
+                    terms = []
+                    types = [rel[0], rel[2]]
+                # skip duplicates
+                if pseudo_sentence not in pseudo_sentences:
+                    verbalized_batch.append([pseudo_sentence, terms, types])
+                    pseudo_sentences.add(pseudo_sentence)
+            verbalized_batches.append([id, verbalized_batch])
+            id += 1
+
+        final_batches = []
+        for id, batch in verbalized_batches:
+            sub_id = 0
+            for i in range(0, len(batch), self.batch_size):
+                final_batches.append([f"{id}_{sub_id}", batch[i:i + self.batch_size]])
+                sub_id += 1
+
+        final_batches_processed = []
+        for id, batch in final_batches:
+            doc_level_terms = set()
+            doc_level_types = set()
+            doc_level_pseudo_sentences = list()
+            for pseudo_sentence, sent_terms, sent_types in batch:
+                doc_level_pseudo_sentences.append(pseudo_sentence)
+                doc_level_terms.update(sent_terms)
+                doc_level_types.update(sent_types)
+            final_batches_processed.append(PseudoSentence(id=id,
+                                                          pseudo_sentences=doc_level_pseudo_sentences,
+                                                          terms=list(doc_level_terms),
+                                                          types=list(doc_level_types)))
+        return final_batches_processed
+
+    def batchify(self) -> List[PseudoSentence]:
+        self._structure_aware_batching()
+        return self._split_batches_and_create_pseudo_sentences()
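
As a usage sketch (not part of the commit): each relation tuple stored under a parent appears to be (relation_id, child_label, relation_label), inferred from how _build_child_to_parents and the batching loop unpack it; the import path for TaxonomyBatchifier is not shown in this diff.

# parent label -> list of (relation_id, child_label, relation_label) tuples (assumed layout)
parent_to_child = {
    "animal": [("r1", "dog", "is a"), ("r2", "cat", "is a")],
    "dog": [("r3", "poodle", "is a")],
}

batchifier = TaxonomyBatchifier(parent_to_child=parent_to_child, batch_size=10)
for ps in batchifier.batchify():
    # Each PseudoSentence groups verbalized relations, e.g. "poodle is a dog"
    print(ps.id, ps.pseudo_sentences, ps.terms, ps.types)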

ontolearner/text2onto/splitter.py

Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
+from collections import defaultdict, deque
+import pandas as pd
+from tqdm import tqdm
+import random
+from ..data_structure import SyntheticText2OntoData
+
+class SyntheticDataSplitter:
+
+    def __init__(self, synthetic_data: SyntheticText2OntoData, onto_name: str):
+        self.pseudo_sentence_batches = pd.DataFrame([ps.dict() for ps in synthetic_data.pseudo_sentences])
+        self.child_to_parent = synthetic_data.child_to_parent
+
+        self.documents = list()
+        self.term_to_doc_id = defaultdict(set)
+        self.type_to_doc_id = defaultdict(set)
+        self.doc_id_to_terms = defaultdict(set)
+        self.doc_id_to_types = defaultdict(set)
+        for row in tqdm(self.pseudo_sentence_batches.itertuples(index=False), total=len(self.pseudo_sentence_batches)):
+            doc_id = str(row.id)
+            self.doc_id_to_types[doc_id] = set(row.types)
+            self.doc_id_to_terms[doc_id] = set(row.terms)
+            for a_type in row.types:
+                self.type_to_doc_id[a_type].add(doc_id)
+            for a_term in row.terms:
+                self.term_to_doc_id[a_term].add(doc_id)
+
+        self.doc_id_to_doc = {doc.id: doc for doc in synthetic_data.generated_docs}
+        print(f"loaded {len(self.doc_id_to_doc)} documents!")
+
+        total_type_count = len(set().union(*self.doc_id_to_types.values()))
+        total_term_count = len(set().union(*self.doc_id_to_terms.values()))
+        print(f" total type count: {total_type_count}")
+        print(f" total term count: {total_term_count}")
+
+        self.onto_name = onto_name
+
+    def set_train_val_test_sizes(self, train_percentage: float = 0.8,
+                                 val_percentage: float = 0.1,
+                                 test_percentage: float = 0.1):
+        if train_percentage + val_percentage + test_percentage != 1:
+            raise Exception("The sum of train/val/test percentages should be 1.")
+        total_types = len(self.type_to_doc_id.keys())
+        total_docs = len(self.doc_id_to_doc.keys())
+        train_quota = int(train_percentage * total_types)
+        val_quota = int(val_percentage * total_types)
+        test_quota = total_types - train_quota - val_quota
+        print(f"train_quota: {train_quota}\nval_quota: {val_quota}\ntest_quota: {test_quota}")
+        split_targets = {
+            'train': train_quota,
+            'val': val_quota,
+            'test': test_quota
+        }
+        train_docs_quota = int(train_percentage * total_docs)
+        val_docs_quota = int(val_percentage * total_docs)
+        test_docs_quota = total_docs - train_docs_quota - val_docs_quota
+        print(
+            f"train docs quota: {train_docs_quota}\nval docs quota: {val_docs_quota}\ntest docs quota: {test_docs_quota}")
+        split_docs_targets = {
+            'train': train_docs_quota,
+            'val': val_docs_quota,
+            'test': test_docs_quota
+        }
+        return split_targets, split_docs_targets
+
+    def assign_types_with_propagation(self, split_name, split_targets, split_docs_targets,
+                                      split_types, split_docs, unassigned_types, unassigned_docs, assigned_docs):
+        target_size = split_targets[split_name]
+        docs_target_size = split_docs_targets[split_name]
+        while len(split_types[split_name]) < target_size and len(
+                split_docs[split_name]) < docs_target_size and unassigned_types:
+            type_seed = unassigned_types.pop()
+            queue = deque([type_seed])
+            while (queue and len(split_types[split_name]) < target_size and
+                   len(split_docs[split_name]) < docs_target_size):
+                current_type = queue.popleft()
+                if current_type in split_types['train'] | split_types['val'] | split_types['test']:
+                    continue
+                split_types[split_name].add(current_type)
+                # Get all documents for this type
+                for doc_id in self.type_to_doc_id.get(current_type, []):
+                    if doc_id in assigned_docs:
+                        continue
+                    split_docs[split_name].add(doc_id)
+                    assigned_docs.add(doc_id)
+                    unassigned_docs.discard(doc_id)
+                    for t in self.doc_id_to_types[doc_id]:
+                        if t not in split_types['train'] | split_types['val'] | split_types['test']:
+                            queue.append(t)
+                            unassigned_types.discard(t)
+        return split_types, split_docs, unassigned_docs, assigned_docs
+
+    def create_train_val_test_splits(self, split_targets, split_docs_targets):
+        split_types = {'train': set(), 'val': set(), 'test': set()}
+        split_docs = {'train': set(), 'val': set(), 'test': set()}
+        all_types = list(self.type_to_doc_id.keys())
+        random.seed(25)
+        random.shuffle(all_types)
+        unassigned_types = set(all_types)
+        unassigned_docs = set(self.doc_id_to_doc.keys())
+        assigned_docs = set()
+
+        for split_name in ['train', 'test', 'val']:
+            split_types, split_docs, unassigned_docs, assigned_docs = self.assign_types_with_propagation(split_name,
+                                                                                                         split_targets,
+                                                                                                         split_docs_targets,
+                                                                                                         split_types,
+                                                                                                         split_docs,
+                                                                                                         unassigned_types,
+                                                                                                         unassigned_docs,
+                                                                                                         assigned_docs)
+
+        # assign the unassigned documents based on their overlap with types in the already assigned types to splits
+        for doc_id in unassigned_docs.copy():
+            doc_types = self.doc_id_to_types[doc_id]
+            doc_type_split_counts = {"train": 0, "test": 0, "val": 0}
+            for a_type in doc_types:
+                for split_name in ['train', 'test', 'val']:
+                    if a_type in split_types[split_name]:
+                        doc_type_split_counts[split_name] += 1
+
+            total = sum(doc_type_split_counts.values())
+            if total == 0:
+                split_docs["train"].add(doc_id)
+            else:
+                max_key = max(doc_type_split_counts, key=doc_type_split_counts.get)
+                split_docs[max_key].add(doc_id)
+            unassigned_docs.discard(doc_id)
+
+        assert len(unassigned_docs) == 0, "There are no unassigned documents."
+
+        print(f"Train: {len(split_docs['train'])} docs, {len(split_types['train'])} types")
+        print(f"Val: {len(split_docs['val'])} docs, {len(split_types['val'])} types")
+        print(f"Test: {len(split_docs['test'])} docs, {len(split_types['test'])} types")
+        return split_docs
+
+    def generate_split_artefacts(self, split_docs):
+        split_terms = {'train': set(), 'val': set(), 'test': set()}
+        terms_splits = {}
+        for split_name in ['train', 'val', 'test']:
+            for doc_id in split_docs[split_name]:
+                split_terms[split_name].update(self.doc_id_to_terms[doc_id])
+            split_terms[split_name] = list(split_terms[split_name])
+            terms_with_types = []
+            for term in split_terms[split_name]:
+                if term in self.child_to_parent:
+                    terms_with_types.append({"term": term, "types": self.child_to_parent[term]})
+                else:
+                    terms_with_types.append({"term": term, "types": []})
+            terms_splits[split_name] = terms_with_types
+
+        types_splits = {}
+        for split_name in ['train', 'val', 'test']:
+            split_types_from_docs = set()
+            for doc_id in split_docs[split_name]:
+                split_types_from_docs.update(self.doc_id_to_types[doc_id])
+            types_with_parents = []
+            for a_type in split_types_from_docs:
+                if a_type in self.child_to_parent:
+                    types_with_parents.append({"type": a_type, "parents": self.child_to_parent[a_type]})
+                else:
+                    types_with_parents.append({"type": a_type, "parents": []})
+            types_splits[split_name] = types_with_parents
+
+        docs_split = {'train': [], 'val': [], 'test': []}
+        split_to_text = {'train': "", 'val': "", 'test': ""}
+        for split_name in ['train', 'val', 'test']:
+            for doc_id in split_docs[split_name]:
+                doc = self.doc_id_to_doc[doc_id]
+                docs_split[split_name].append(doc)
+                split_to_text[split_name] += " " + doc.title + " " + doc.text
+
+        types2docs_splits = {}
+        for split_name in ['train', 'val', 'test']:
+            type2doc = defaultdict(list)
+            split_types_from_docs = set()
+            for doc_id in split_docs[split_name]:
+                split_types_from_docs.update(self.doc_id_to_types[doc_id])
+            for a_type in split_types_from_docs:
+                for doc_id in self.type_to_doc_id[a_type]:
+                    if doc_id in split_docs[split_name]:
+                        extraction_type = "abstractive"
+                        if a_type in split_to_text[split_name]:
+                            extraction_type = "extractive"
+                        type2doc[a_type].append({"doc_id": doc_id, "extraction_type": extraction_type})
+            types2docs_splits[split_name] = type2doc
+
+        return terms_splits, types_splits, docs_split, types2docs_splits
+
+    def split(self, train: float = 0.8, val: float = 0.1, test: float = 0.1):
+        split_targets, split_docs_targets = self.set_train_val_test_sizes(train_percentage=train,
+                                                                          val_percentage=val,
+                                                                          test_percentage=test)
+        split_docs = self.create_train_val_test_splits(split_targets, split_docs_targets)
+        terms, types, docs, types2docs = self.generate_split_artefacts(split_docs)
+        return terms, types, docs, types2docs
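
A usage sketch of the splitter (not part of the commit). Here synthetic_data stands for a SyntheticText2OntoData instance, e.g. the output of the SyntheticGenerator step, and the ontology name is illustrative.

from ontolearner.text2onto import SyntheticDataSplitter

splitter = SyntheticDataSplitter(synthetic_data=synthetic_data, onto_name="example-onto")
terms, types, docs, types2docs = splitter.split(train=0.8, val=0.1, test=0.1)

# Each artefact is keyed by split name ('train', 'val', 'test').
print(len(docs["train"]), len(docs["val"]), len(docs["test"]))
print(terms["train"][:2])   # [{"term": ..., "types": [...]}, ...]
print(types["train"][:2])   # [{"type": ..., "parents": [...]}, ...]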
