Skip to content

Commit af404ff

Browse files
committed
feature: add SplitWalker
1 parent 339fcc0 commit af404ff

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

pyrdf2vec/walkers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .anonymous import AnonymousWalker
99
from .halk import HALKWalker
1010
from .ngram import NGramWalker
11+
from .split import SplitWalker
1112
from .walklet import WalkletWalker
1213
from .weisfeiler_lehman import WLWalker
1314

@@ -17,6 +18,7 @@
1718
"HALKWalker",
1819
"NGramWalker",
1920
"RandomWalker",
21+
"SplitWalker",
2022
"Walker",
2123
"WalkletWalker",
2224
"WLWalker",

pyrdf2vec/walkers/split.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import os
2+
import re
3+
from typing import Set
4+
5+
import attr
6+
7+
from pyrdf2vec.graphs import KG, Vertex
8+
from pyrdf2vec.typings import EntityWalks, List, SWalk, Walk
9+
from pyrdf2vec.walkers import RandomWalker
10+
11+
12+
@attr.s
class SplitWalker(RandomWalker):
    """Splitting walking strategy which splits each vertex (except the root
    node) present in the randomly extracted walks.

    Attributes:
        func_split: The function to call for the splitting of vertices. In
            case of reimplementation, it is important to respect the
            signature imposed by the `basic_split` function. When left to
            None, `basic_split` is used.

    """

    func_split = attr.ib(kw_only=True, default=None, repr=False)

    def __attrs_post_init__(self):
        # Fall back to the built-in strategy when the caller supplies none.
        if self.func_split is None:
            self.func_split = self.basic_split

    @staticmethod
    def _iri_parts(name):
        """Splits a vertex name on '#' (and whitespace) when it contains
        "http"; returns an empty list for any other name.
        """
        if "http" not in name:
            return []
        return " ".join(re.split("[#]", name)).split()

    @staticmethod
    def _split_predicate(name, parts):
        """Splits a predicate name on capitalized words (case is kept)."""
        # Only index the fragment when the name really had a '#': a name
        # containing "http" but no '#' yields a single part, and indexing
        # parts[1] would raise IndexError. In that case the full name is used.
        local_name = parts[1] if len(parts) > 1 else name
        return [
            chunk
            for chunk in re.split(r"([A-Z][a-z]*)", local_name)
            if chunk
        ]

    @staticmethod
    def _split_object(name, parts):
        """Splits an object name into a number, or into alphanumeric tokens
        (case is kept).
        """
        local_name = parts[-1] if parts else name
        try:
            # Numeric names are normalized through float ("3" -> "3.0").
            return [str(float(local_name))]
        except ValueError:
            pass
        tokens = re.sub("[^A-Za-z0-9]+", " ", local_name).split()
        if len(tokens) == 1:
            # A lone token such as "bond3209" is separated into its leading
            # letters and the digits that follow them.
            match = re.match(r"([a-z]+)([0-9]+)", tokens[0], re.I)
            if match:
                tokens = list(match.groups())
        return tokens

    def basic_split(self, walks: List[Walk]) -> Set[SWalk]:
        """Splits vertices of random walks for an entity. To achieve this,
        each vertex (except the root node) is split according to symbols and
        capitalization, lowercased, and duplicated tokens within a walk are
        removed (the first occurrence is kept).

        Vertices at odd positions of a walk are handled as predicates and
        vertices at even positions as objects.

        Some examples:
            ('http://dl-learner.org/carcinogenesis#d19',
             'http://dl-learner.org/carcinogenesis#hasBond',
             'http://dl-learner.org/carcinogenesis#bond3209')

            -> ('http://dl-learner.org/carcinogenesis#d19', 'has', 'bond',
                '3209')

            ('http://dl-learner.org/carcinogenesis#d19',
             'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
             'http://dl-learner.org/carcinogenesis#Compound')

            -> ('http://dl-learner.org/carcinogenesis#d19', 'type',
                'compound')

        Args:
            walks: The random extracted walks.

        Returns:
            The set of tuples that contains split walks.

        """
        canonical_walks: Set[SWalk] = set()
        for walk in walks:
            # The root node is kept as is.
            canonical_walk = [walk[0].name]
            for position, vertex in enumerate(walk[1:], 1):
                parts = self._iri_parts(vertex.name)
                if position % 2 == 1:
                    tokens = self._split_predicate(vertex.name, parts)
                else:
                    tokens = self._split_object(vertex.name, parts)
                canonical_walk.extend(token.lower() for token in tokens)
            # dict keys preserve insertion order, which removes duplicated
            # tokens while keeping the order of their first occurrence.
            canonical_walks.add(tuple(dict.fromkeys(canonical_walk)))
        return canonical_walks

    def _extract(self, kg: KG, entity: Vertex) -> EntityWalks:
        """Extracts random walks for an entity based on a Knowledge Graph.

        The walks are obtained from `extract_walks` (not defined in this
        class; presumably inherited from `RandomWalker`) and then passed to
        `func_split`.

        Args:
            kg: The Knowledge Graph.
            entity: The root node to extract walks.

        Returns:
            A dictionary having the entity name as key and a list of tuples as
            value corresponding to the split walks. The list is built from a
            set, so its order is not guaranteed.

        """
        walks = self.extract_walks(kg, entity)
        return {entity.name: list(self.func_split(walks))}

0 commit comments

Comments
 (0)