|
| 1 | +import os |
| 2 | +import re |
| 3 | +from typing import Set |
| 4 | + |
| 5 | +import attr |
| 6 | + |
| 7 | +from pyrdf2vec.graphs import KG, Vertex |
| 8 | +from pyrdf2vec.typings import EntityWalks, List, SWalk, Walk |
| 9 | +from pyrdf2vec.walkers import RandomWalker |
| 10 | + |
| 11 | + |
@attr.s
class SplitWalker(RandomWalker):
    """Splitting walking strategy which splits each vertex (except the root
    node) present in the randomly extracted walks.

    Attributes:
        func_split: The function to call for the splitting of vertices. It is
            called with the extracted walks as its only argument and must
            return an iterable of split walks. In case of reimplementation,
            it is important to respect the signature imposed by the
            `basic_split` function. Defaults to `basic_split`.
    """

    func_split = attr.ib(kw_only=True, default=None, repr=False)

    def __attrs_post_init__(self):
        """Uses `basic_split` when no splitting function was provided."""
        if self.func_split is None:
            self.func_split = self.basic_split

    @staticmethod
    def _uri_parts(name: str) -> List[str]:
        """Returns the '#'-separated (and whitespace-separated) parts of a
        name that contains "http", or an empty list for any other name."""
        if "http" not in name:
            return []
        # Replacing '#' by a blank and splitting on whitespace also drops the
        # empty part left by a trailing '#'.
        return name.replace("#", " ").split()

    @staticmethod
    def _split_predicate(name: str) -> List[str]:
        """Splits a predicate name on its capitalization (lowercased)."""
        parts = SplitWalker._uri_parts(name)
        # The fragment only exists when the URI really had a non-empty part
        # after '#'. Otherwise (no '#', or a trailing '#') the full name is
        # used instead of raising an IndexError on `parts[1]`.
        local_name = parts[1] if len(parts) > 1 else name
        return [
            token.lower()
            for token in re.split(r"([A-Z][a-z]*)", local_name)
            if token
        ]

    @staticmethod
    def _split_object(name: str) -> List[str]:
        """Splits an object name on its symbols (lowercased).

        A name that parses as a float is kept as a single normalized token.
        A name made of one alphanumeric token that starts with letters
        followed by digits (e.g., "bond3209") is split into these two groups.
        """
        parts = SplitWalker._uri_parts(name)
        local_name = parts[-1] if parts else name
        try:
            tokens = [str(float(local_name))]
        except ValueError:
            tokens = re.sub("[^A-Za-z0-9]+", " ", local_name).split()
            if len(tokens) == 1:
                match = re.match(r"([a-z]+)([0-9]+)", tokens[0], re.I)
                if match:
                    tokens = list(match.groups())
        return [token.lower() for token in tokens]

    def basic_split(self, walks: List[Walk]) -> Set[SWalk]:
        """Splits vertices of random walks for an entity. To achieve this,
        each vertex (except the root node) is split according to symbols and
        capitalization, and duplicated tokens are removed from the walk.

        Some examples:
            ('http://dl-learner.org/carcinogenesis#d19',
             'http://dl-learner.org/carcinogenesis#hasBond',
             'http://dl-learner.org/carcinogenesis#bond3209')

            -> ('http://dl-learner.org/carcinogenesis#d19', 'has', 'bond',
                '3209')

            ('http://dl-learner.org/carcinogenesis#d19',
             'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
             'http://dl-learner.org/carcinogenesis#Compound')

            -> ('http://dl-learner.org/carcinogenesis#d19', 'type',
                'compound')

        Args:
            walks: The random extracted walks. In each walk, the vertices at
                odd positions are handled as predicates and the vertices at
                even positions (after the root) as objects.

        Returns:
            The set of tuples that contains split walks.

        """
        canonical_walks: Set[SWalk] = set()
        for walk in walks:
            canonical_walk = [walk[0].name]
            for position, vertex in enumerate(walk[1:], 1):
                if position % 2 == 1:
                    canonical_walk += self._split_predicate(vertex.name)
                else:
                    canonical_walk += self._split_object(vertex.name)
            # dict.fromkeys removes the duplicated tokens while keeping the
            # order of their first occurrence.
            canonical_walks.add(tuple(dict.fromkeys(canonical_walk)))
        return canonical_walks

    def _extract(self, kg: KG, entity: Vertex) -> EntityWalks:
        """Extracts random walks for an entity based on a Knowledge Graph.

        Args:
            kg: The Knowledge Graph.
            entity: The root node to extract walks.

        Returns:
            A dictionary having the entity as key and a list of tuples as value
            corresponding to the extracted walks, once split by `func_split`.

        """
        walks = self.extract_walks(kg, entity)
        return {entity.name: list(self.func_split(walks))}
0 commit comments