relation_extraction.py
import asyncio
import random
import time
from typing import Optional

from nltk.corpus import wordnet as wn
from pydantic import BaseModel

from agentics import AG
from agentics.core.default_types import Astr
from agentics.core.transducible_functions import With, transducible

N_TERMS = 500
N_CLUSTERS = 5

# llm = "watsonx/openai/gpt-oss-120b"
llm = AG.get_llm_provider()
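
# Assumption about the local environment: the nltk WordNet corpus must be
# downloaded once before this script runs, e.g.:
#   python -c "import nltk; nltk.download('wordnet')"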


class InputTerms(BaseModel):
    terms: Optional[list[str]] = None


class Relation(BaseModel):
    subject: Optional[str] = None
    object: Optional[str] = None
    relation_type: Optional[str] = None


class Ontology(BaseModel):
    entities: Optional[list[str]] = None
    relations: Optional[list[Relation]] = None
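
# Illustrative target shape (values invented for illustration, not from any run):
#   Ontology(
#       entities=["bank", "loan"],
#       relations=[Relation(subject="bank", object="loan", relation_type="issues")],
#   )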

extract_relations = Ontology << With(
    InputTerms,
    instructions="Derive an Ontology from the input "
    "objects by identifying relations among them",
    verbose_transduction=False,
    batch_size=20,
    llm=llm,
)
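
# `Ontology << With(InputTerms, ...)` builds an async transduction: a callable
# that maps InputTerms instances (a single one, or a list of them, as used
# below) to Ontology instances via the LLM. A minimal call sketch, with
# illustrative terms, assuming a running event loop:
#   ontology = await extract_relations(InputTerms(terms=["bond", "equity", "yield"]))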


@transducible(llm=llm)
async def relation_extraction(input_objects: InputTerms) -> Ontology:
    # Single-shot path: transduce the whole term list into one Ontology.
    ontology = await extract_relations(input_objects)
    return ontology
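
# Single-shot usage sketch (illustrative input; suited to small term lists):
#   ontology = asyncio.run(relation_extraction(InputTerms(terms=["stock", "dividend"])))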


async def relation_extraction_map_reduce(
    input_objects: InputTerms, n_clusters: int = 10
) -> Ontology:
    # Map step: partition the terms into clusters and extract an ontology
    # from each cluster independently.
    terms = AG(states=[Astr(term) for term in input_objects.terms])
    clusters_ags = terms.cluster(n_partitions=n_clusters)
    clusters = [
        InputTerms(terms=[x.value for x in cluster.states]) for cluster in clusters_ags
    ]
    ontologies = await extract_relations(clusters)
    # Reduce step: merge the per-cluster ontologies, deduplicating entities
    # and concatenating relations.
    final_ontology = Ontology()
    final_ontology.entities = list(
        {entity for ontology in ontologies for entity in (ontology.entities or [])}
    )
    final_ontology.relations = [
        Relation(
            subject=relation.subject,
            relation_type=relation.relation_type,
            object=relation.object,
        )
        for ontology in ontologies
        for relation in (ontology.relations or [])
    ]
    return final_ontology
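
# Map-reduce usage sketch (illustrative input; `n_clusters` bounds how many
# terms each LLM call has to relate at once):
#   ontology = asyncio.run(
#       relation_extraction_map_reduce(
#           InputTerms(terms=["asset", "liability", "equity", "bond"]), n_clusters=2
#       )
#   )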


async def evaluate_relation_extraction(
    output_file: Optional[str] = None, n_terms: int = 10000
):
    terms = random_frequent_terms(n=n_terms)
    # for i in range(1000, 1001, 500):
    print(f"--- Processing {n_terms} instances ---")
    # start_time = time.time()
    # generate_terms = InputTerms << With(Astr, llm=llm, instructions=
    #     "Generate a list of financial terms")
    # financial_terms = await generate_terms(Astr(f"Generate a list of {i} terms related to finance."))
    # end_time = time.time()
    # print(f"Term Generation Time: {end_time - start_time:.2f} seconds")
    # used_terms = financial_terms
    used_terms = InputTerms(terms=terms)
    # Heuristic: target roughly 50 terms per cluster.
    n_clusters = n_terms // 50
    # start_time = time.time()
    print(f"Using {n_clusters} clusters for map-reduce")
    ontology = await relation_extraction_map_reduce(used_terms, n_clusters=n_clusters)
    # end_time = time.time()
    # print(f"Run {i+1} - Map-Reduce Relation Extraction Time: {end_time - start_time:.2f} seconds")
    print(
        f"Extracted {len(ontology.entities or [])} entities "
        f"and {len(ontology.relations or [])} relations"
    )
    print(ontology.model_dump_json(indent=2))


def random_frequent_terms(n=50, min_freq=100, pos=wn.NOUN):
    """
    Return up to `n` random lemma names from WordNet
    whose frequency (cntlist.rev) is >= min_freq.
    pos can be: wn.NOUN, wn.VERB, wn.ADJ, wn.ADV or None for all.
    """
    frequent_lemmas = []
    # Iterate over all lemma names (optionally filtered by POS)
    for name in wn.all_lemma_names(pos=pos):
        # Each lemma name can correspond to several Lemma objects (different synsets)
        for lemma in wn.lemmas(name, pos=pos):
            if lemma.count() >= min_freq:
                frequent_lemmas.append(lemma)
    if not frequent_lemmas:
        return []
    # Sample at most n lemmas
    sampled_lemmas = random.sample(frequent_lemmas, k=min(n, len(frequent_lemmas)))
    # If you just want the *words* (strings), dedupe names:
    terms = sorted({lemma.name() for lemma in sampled_lemmas})
    return terms
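
# For reproducible samples, seed the module-level RNG first (illustrative):
#   random.seed(42)
#   print(random_frequent_terms(n=10))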

if __name__ == "__main__":
    asyncio.run(evaluate_relation_extraction())