Skip to content

Commit 20f0049

Browse files
committed
unnest: support non-string columns in UnnestLinker, move to separate module
1 parent f264e48 commit 20f0049

File tree

3 files changed

+100
-84
lines changed

3 files changed

+100
-84
lines changed

mismo/linker/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
from mismo.linker._basic import EmptyLinker as EmptyLinker
44
from mismo.linker._basic import FullLinker as FullLinker
5-
from mismo.linker._basic import UnnestLinker as UnnestLinker
65
from mismo.linker._common import Linker as Linker
76
from mismo.linker._id_linker import IDLinker as IDLinker
87
from mismo.linker._join_linker import JoinLinker as JoinLinker
98
from mismo.linker._key_linker import KeyLinker as KeyLinker
109
from mismo.linker._or_linker import OrLinker as OrLinker
10+
from mismo.linker._unnest import UnnestLinker as UnnestLinker

mismo/linker/_basic.py

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -40,86 +40,3 @@ def __join_condition__(
4040

4141
def __call__(self, left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
    """Link ``left`` and ``right`` by delegating to ``self._linker``.

    Parameters
    ----------
    left
        The left table, passed through unchanged.
    right
        The right table, passed through unchanged.

    Returns
    -------
    Whatever the wrapped ``self._linker`` returns for the two tables.
    """
    linkage = self._linker(left, right)
    return linkage
43-
44-
45-
class UnnestLinker(_common.Linker):
46-
"""A [Linker][mismo.Linker] that unnests a column before linking.
47-
48-
This is useful if you have records with sets of tokens that you want to link on,
49-
for example:
50-
- splitting names into words/tokens and linking where any token matches.
51-
- tags, such as product categories, where you want to link where any tag matches.
52-
53-
This links where ANY of the unnested values match.
54-
55-
Examples
56-
--------
57-
>>> import ibis
58-
>>> from ibis import _
59-
>>> import mismo
60-
>>> ibis.options.interactive = True
61-
>>> linkage = mismo.playdata.load_patents()
62-
>>> t = linkage.left.select("record_id", "name")
63-
>>> t.head()
64-
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
65-
┃ record_id ┃ name ┃
66-
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
67-
│ uint32 │ string │
68-
├───────────┼──────────────────────────────┤
69-
│ 2909 │ * AGILENT TECHNOLOGIES, INC. │
70-
│ 3574 │ * AKZO NOBEL N.V. │
71-
│ 3575 │ * AKZO NOBEL NV │
72-
│ 3779 │ * ALCATEL N.V. │
73-
│ 3780 │ * ALCATEL N.V. │
74-
└───────────┴──────────────────────────────┘
75-
76-
>>> tokens = _.name.upper().split(" ").filter(lambda x: x.length() > 4)
77-
>>> t.select(tokens.name("tokens"))
78-
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
79-
┃ tokens ┃
80-
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
81-
│ array<string> │
82-
├──────────────────────────────┤
83-
│ ['AGILENT', 'TECHNOLOGIES,'] │
84-
│ ['NOBEL'] │
85-
│ ['NOBEL'] │
86-
│ ['ALCATEL'] │
87-
│ ['ALCATEL'] │
88-
│ ['ALCATEL'] │
89-
│ ['CANON', 'EUROPA'] │
90-
│ ['CANON', 'EUROPA'] │
91-
│ ['CANON', 'EUROPA'] │
92-
│ [] │
93-
│ … │
94-
└──────────────────────────────┘
95-
96-
Now, block the tables together wherever two records share a token.
97-
Note that this blocked `* SCHLUMBERGER LIMITED` with `* SCHLUMBERGER TECHNOLOGY BV`
98-
because they both share the `SCHLUMBERGER` token.
99-
100-
>>> linker = mismo.KeyLinker(tokens.unnest())
101-
>>> linker(t, t).links.filter(_.name_l != _.name_r).order_by(
102-
... "record_id_l", "record_id_r"
103-
... ).head() # doctest: +SKIP
104-
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
105-
┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l ┃ name_r ┃
106-
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
107-
│ int64 │ int64 │ float64 │ float64 │ string │ string │
108-
├─────────────┼─────────────┼────────────┼────────────┼────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────┤
109-
│ 2909 │ 13390969 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V │
110-
│ 2909 │ 13390970 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V. │
111-
│ 2909 │ 13391015 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherland B.V. │
112-
│ 2909 │ 13391055 │ 0.0 │ 52.50 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
113-
│ 2909 │ 13391056 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
114-
└─────────────┴─────────────┴────────────┴────────────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
115-
"""
116-
117-
def __init__(self, column: str, *, task: Literal["dedupe", "link"] | None = None):
    """Remember the column to unnest and build the inner linker.

    Parameters
    ----------
    column
        Name of the column that ``__call__`` unnests; it is also passed as
        the first argument to the inner ``JoinLinker``.
    task
        Stored on the instance and forwarded to the inner ``JoinLinker``.
    """
    self.task = task
    self.column = column
    inner_linker = _join_linker.JoinLinker(column, task=task)
    self._linker = inner_linker
121-
122-
def __call__(self, left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
    """Unnest ``self.column`` in both tables, then link them.

    Parameters
    ----------
    left
        The left table; the unnested values are written back under the
        same column name.
    right
        The right table; handled the same way as ``left``.

    Returns
    -------
    The result of calling the inner linker on the two unnested tables.
    """
    left = left.mutate(left[self.column].unnest().name(self.column))
    # The right side must be derived from ``right`` itself. Starting from
    # ``left`` here would discard the right table's rows and mix a column
    # expression from one table into the other.
    right = right.mutate(right[self.column].unnest().name(self.column))
    return self._linker(left, right)

mismo/linker/_unnest.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
from __future__ import annotations
2+
3+
from typing import Literal
4+
5+
import ibis
6+
7+
from mismo._resolve import IntoValueResolver, value_resolver
8+
from mismo.linkage import _linkage
9+
from mismo.linker import _common, _join_linker
10+
11+
12+
class UnnestLinker(_common.Linker):
13+
"""A [Linker][mismo.Linker] that unnests a column before linking.
14+
15+
This is useful if you have records with sets of tokens that you want to link on,
16+
for example:
17+
- splitting names into words/tokens and linking where any token matches.
18+
- tags, such as product categories, where you want to link where any tag matches.
19+
20+
This links where ANY of the unnested values match.
21+
22+
Examples
23+
--------
24+
>>> import ibis
25+
>>> from ibis import _
26+
>>> import mismo
27+
>>> ibis.options.interactive = True
28+
>>> linkage = mismo.playdata.load_patents()
29+
>>> t = linkage.left.select("record_id", "name")
30+
>>> t.head()
31+
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
32+
┃ record_id ┃ name ┃
33+
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
34+
│ uint32 │ string │
35+
├───────────┼──────────────────────────────┤
36+
│ 2909 │ * AGILENT TECHNOLOGIES, INC. │
37+
│ 3574 │ * AKZO NOBEL N.V. │
38+
│ 3575 │ * AKZO NOBEL NV │
39+
│ 3779 │ * ALCATEL N.V. │
40+
│ 3780 │ * ALCATEL N.V. │
41+
└───────────┴──────────────────────────────┘
42+
43+
>>> tokens = _.name.upper().split(" ").filter(lambda x: x.length() > 4)
44+
>>> t.select(tokens.name("tokens"))
45+
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
46+
┃ tokens ┃
47+
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
48+
│ array<string> │
49+
├──────────────────────────────┤
50+
│ ['AGILENT', 'TECHNOLOGIES,'] │
51+
│ ['NOBEL'] │
52+
│ ['NOBEL'] │
53+
│ ['ALCATEL'] │
54+
│ ['ALCATEL'] │
55+
│ ['ALCATEL'] │
56+
│ ['CANON', 'EUROPA'] │
57+
│ ['CANON', 'EUROPA'] │
58+
│ ['CANON', 'EUROPA'] │
59+
│ [] │
60+
│ … │
61+
└──────────────────────────────┘
62+
63+
Now, block the tables together wherever two records share a token.
64+
Note that this blocked `* SCHLUMBERGER LIMITED` with `* SCHLUMBERGER TECHNOLOGY BV`
65+
because they both share the `SCHLUMBERGER` token.
66+
67+
>>> linker = mismo.KeyLinker(tokens.unnest())
68+
>>> linker(t, t).links.filter(_.name_l != _.name_r).order_by(
69+
... "record_id_l", "record_id_r"
70+
... ).head() # doctest: +SKIP
71+
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
72+
┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l ┃ name_r ┃
73+
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
74+
│ int64 │ int64 │ float64 │ float64 │ string │ string │
75+
├─────────────┼─────────────┼────────────┼────────────┼────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────┤
76+
│ 2909 │ 13390969 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V │
77+
│ 2909 │ 13390970 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V. │
78+
│ 2909 │ 13391015 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherland B.V. │
79+
│ 2909 │ 13391055 │ 0.0 │ 52.50 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
80+
│ 2909 │ 13391056 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
81+
└─────────────┴─────────────┴────────────┴────────────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
82+
"""
83+
84+
def __init__(
    self,
    column: IntoValueResolver,
    *,
    task: Literal["dedupe", "link"] | None = None,
):
    """Build the column resolver and the inner linker.

    Parameters
    ----------
    column
        Anything accepted by ``value_resolver``. The resulting resolver is
        stored as ``self.column_resolver`` and also passed as the first
        argument to the inner ``JoinLinker``.
    task
        Stored on the instance and forwarded to the inner ``JoinLinker``.
    """
    resolver = value_resolver(column)
    self.column_resolver = resolver
    self.task = task
    self._linker = _join_linker.JoinLinker(resolver, task=task)
93+
94+
def __call__(self, left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
    """Unnest the resolved column in both tables, then link them.

    The column is resolved against each table separately. Its unnested
    form is written back under the resolved column's own name, and the two
    resulting tables are handed to the inner linker.

    Parameters
    ----------
    left
        The left table.
    right
        The right table.

    Returns
    -------
    The result of calling the inner linker on the two unnested tables.
    """
    tables = (left, right)
    # Resolve against both inputs first, before either table is modified.
    columns = [self.column_resolver(table) for table in tables]
    unnested_left, unnested_right = (
        table.mutate(col.unnest().name(col.get_name()))
        for table, col in zip(tables, columns)
    )
    # NOTE(review): the inner linker was built from the same resolver, not
    # from the unnested column's name; confirm it picks up the unnested
    # column for non-trivial (expression) resolvers.
    return self._linker.__call__(unnested_left, unnested_right)

0 commit comments

Comments
 (0)