|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from typing import Literal |
| 4 | + |
| 5 | +import ibis |
| 6 | + |
| 7 | +from mismo._resolve import IntoValueResolver, value_resolver |
| 8 | +from mismo.linkage import _linkage |
| 9 | +from mismo.linker import _common, _join_linker |
| 10 | + |
| 11 | + |
class UnnestLinker(_common.Linker):
    """A [Linker][mismo.Linker] that unnests a column before linking.

    This is useful if you have records with sets of tokens that you want to
    link on, for example:

    - splitting names into words/tokens and linking where any token matches.
    - tags, such as product categories, where you want to link where any tag matches.

    This links where ANY of the unnested values match.

    Examples
    --------
    >>> import ibis
    >>> from ibis import _
    >>> import mismo
    >>> ibis.options.interactive = True
    >>> linkage = mismo.playdata.load_patents()
    >>> t = linkage.left.select("record_id", "name")
    >>> t.head()
    ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
    ┃ record_id ┃ name                         ┃
    ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
    │ uint32    │ string                       │
    ├───────────┼──────────────────────────────┤
    │      2909 │ * AGILENT TECHNOLOGIES, INC. │
    │      3574 │ * AKZO NOBEL N.V.            │
    │      3575 │ * AKZO NOBEL NV              │
    │      3779 │ * ALCATEL N.V.               │
    │      3780 │ * ALCATEL N.V.               │
    └───────────┴──────────────────────────────┘

    >>> tokens = _.name.upper().split(" ").filter(lambda x: x.length() > 4)
    >>> t.select(tokens.name("tokens"))
    ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
    ┃ tokens                       ┃
    ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
    │ array<string>                │
    ├──────────────────────────────┤
    │ ['AGILENT', 'TECHNOLOGIES,'] │
    │ ['NOBEL']                    │
    │ ['NOBEL']                    │
    │ ['ALCATEL']                  │
    │ ['ALCATEL']                  │
    │ ['ALCATEL']                  │
    │ ['CANON', 'EUROPA']          │
    │ ['CANON', 'EUROPA']          │
    │ ['CANON', 'EUROPA']          │
    │ []                           │
    │ …                            │
    └──────────────────────────────┘

    Now, block the tables together wherever two records share a token.
    Note that in the output below `* AGILENT TECHNOLOGIES, INC.` is paired with
    the `Hitachi Global Storage Technologies, ...` records because they share
    the `TECHNOLOGIES,` token.

    >>> linker = mismo.KeyLinker(tokens.unnest())
    >>> linker(t, t).links.filter(_.name_l != _.name_r).order_by(
    ...     "record_id_l", "record_id_r"
    ... ).head()  # doctest: +SKIP
    ┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
    ┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l                                                     ┃ name_r                                                     ┃
    ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
    │ int64       │ int64       │ float64    │ float64    │ string                                                     │ string                                                     │
    ├─────────────┼─────────────┼────────────┼────────────┼────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────┤
    │        2909 │    13390969 │        0.0 │      52.35 │ * AGILENT TECHNOLOGIES, INC.                               │ Hitachi Global Storage Technologies, Inc. Netherlands B.V  │
    │        2909 │    13390970 │        0.0 │      52.35 │ * AGILENT TECHNOLOGIES, INC.                               │ Hitachi Global Storage Technologies, Inc. Netherlands B.V. │
    │        2909 │    13391015 │        0.0 │      52.35 │ * AGILENT TECHNOLOGIES, INC.                               │ Hitachi Global Storage Technologies, Netherland B.V.       │
    │        2909 │    13391055 │        0.0 │      52.50 │ * AGILENT TECHNOLOGIES, INC.                               │ Hitachi Global Storage Technologies, Netherlands, B.V.     │
    │        2909 │    13391056 │        0.0 │      52.35 │ * AGILENT TECHNOLOGIES, INC.                               │ Hitachi Global Storage Technologies, Netherlands, B.V.     │
    └─────────────┴─────────────┴────────────┴────────────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
    """  # noqa: E501

    # NOTE(review): the example in the class docstring builds a
    # `mismo.KeyLinker` over `tokens.unnest()` rather than an `UnnestLinker`;
    # confirm whether it should demonstrate this class instead.

    def __init__(
        self,
        column: IntoValueResolver,
        *,
        task: Literal["dedupe", "link"] | None = None,
    ):
        """Create a linker that unnests ``column`` in both tables before joining.

        Parameters
        ----------
        column
            Anything accepted by ``value_resolver``. The resulting resolver is
            stored as ``column_resolver`` and is applied to each input table to
            obtain the value that gets unnested.
        task
            Stored as ``task`` and forwarded unchanged to the wrapped
            ``JoinLinker``.
        """
        resolver = value_resolver(column)
        self.column_resolver = resolver
        self.task = task
        # The actual linking is delegated to a JoinLinker built from the very
        # same resolver; this class only pre-processes the input tables.
        self._linker = _join_linker.JoinLinker(resolver, task=task)

    @staticmethod
    def _with_unnested(table: ibis.Table, values) -> ibis.Table:
        """Return ``table`` with ``values`` unnested, stored under ``values``' own name."""
        return table.mutate(values.unnest().name(values.get_name()))

    def __call__(self, left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
        """Unnest the configured column in ``left`` and ``right``, then link them.

        Parameters
        ----------
        left
            The left table.
        right
            The right table.

        Returns
        -------
        _linkage.Linkage
            Whatever the wrapped ``JoinLinker`` returns for the two tables
            after the unnested column has been written into each of them.
        """
        # Resolve against both tables up front, before either table is changed.
        left_values = self.column_resolver(left)
        right_values = self.column_resolver(right)
        return self._linker(
            self._with_unnested(left, left_values),
            self._with_unnested(right, right_values),
        )
0 commit comments