Skip to content

Commit 233de4b

Browse files
committed
breaking: remove empty_linkage() and full_linkage()
You should do EmptyLinker()(left, right) and FullLinker()(left, right) instead
1 parent 8984196 commit 233de4b

File tree

5 files changed

+51
-50
lines changed

5 files changed

+51
-50
lines changed

mismo/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@
3838
from mismo.linker import KeyLinker as KeyLinker
3939
from mismo.linker import Linker as Linker
4040
from mismo.linker import OrLinker as OrLinker
41-
from mismo.linker import empty_linkage as empty_linkage
42-
from mismo.linker import full_linkage as full_linkage
4341
from mismo.types import Diff as Diff
4442
from mismo.types import DiffStats as DiffStats
4543
from mismo.types import LinkCountsTable as LinkCountsTable

mismo/lib/name/tests/test_name_dimension.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_name_dimension(name_table, table_factory):
2828
"name_compared": "int8",
2929
},
3030
)
31-
links = mismo.full_linkage(name_table, name_table, task="link").links
31+
links = mismo.FullLinker()(name_table, name_table, task="link").links
3232
compared = dim.compare(links)
3333
compared = compared.semi_join(expected, ["record_id_l", "record_id_r"])
3434
compared = compared.cache()

mismo/linker/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
from mismo.linker._basic import EmptyLinker as EmptyLinker
44
from mismo.linker._basic import FullLinker as FullLinker
55
from mismo.linker._basic import UnnestLinker as UnnestLinker
6-
from mismo.linker._basic import empty_linkage as empty_linkage
7-
from mismo.linker._basic import full_linkage as full_linkage
86
from mismo.linker._common import Linker as Linker
97
from mismo.linker._id_linker import IDLinker as IDLinker
108
from mismo.linker._join_linker import JoinLinker as JoinLinker

mismo/linker/_basic.py

Lines changed: 49 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,7 @@
1010

1111
class FullLinker(_common.Linker):
1212
"""
13-
A [Linker][mismo.Linker] that yields all possible pairs.
14-
15-
This will be N x M pairs for linking tasks,
16-
and N x (M-1) pairs for deduplication tasks.
13+
A [Linker][mismo.Linker] that yields all possible pairs (MxN of them).
1714
"""
1815

1916
def __init__(self, *, task: Literal["dedupe", "link"] | None = None):
@@ -29,34 +26,12 @@ def __call__(self, left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
2926
return self._linker(left, right)
3027

3128

32-
def full_linkage(
33-
left: ibis.Table,
34-
right: ibis.Table,
35-
*,
36-
task: Literal["dedupe", "link"] | None = None,
37-
) -> _linkage.Linkage:
38-
"""
39-
Create a linkage with all (M x N) possible pairs of the two tables.
40-
41-
Parameters
42-
----------
43-
left
44-
The left table to link.
45-
right
46-
The right table to link.
47-
task
48-
The task type, either "dedupe" or "link".
49-
If None, will be inferred from whether the left and right tables are the same.
50-
"""
51-
return FullLinker(task=task)(left, right)
52-
53-
5429
class EmptyLinker(_common.Linker):
5530
"""A [Linker][mismo.Linker] that yields no pairs."""
5631

57-
def __init__(self):
58-
# The task doesn't matter here, since we won't be linking anything.
59-
self._linker = _join_linker.JoinLinker(False, on_slow="ignore", task="link")
32+
def __init__(self, *, task: Literal["dedupe", "link"] | None = None):
33+
self.task = task
34+
self._linker = _join_linker.JoinLinker(False, on_slow="ignore", task=task)
6035

6136
def __join_condition__(
6237
self, left: ibis.Table, right: ibis.Table
@@ -67,22 +42,52 @@ def __call__(self, left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
6742
return self._linker(left, right)
6843

6944

70-
def empty_linkage(left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
71-
"""
72-
Create a Linkage with no pairs. This is useful for testing or as a placeholder.
73-
74-
Parameters
75-
----------
76-
left
77-
The left table to link.
78-
right
79-
The right table to link.
80-
"""
81-
return EmptyLinker()(left, right)
82-
83-
8445
class UnnestLinker(_common.Linker):
85-
"""A [Linker][mismo.Linker] that unnests a column before linking."""
46+
"""A [Linker][mismo.Linker] that unnests a column before linking.
47+
48+
We can even block on arrays! For example, first let's split each name into
49+
significant tokens:
50+
51+
>>> tokens = _.name.upper().split(" ").filter(lambda x: x.length() > 4)
52+
>>> t.select(tokens.name("tokens"))
53+
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
54+
┃ tokens ┃
55+
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
56+
│ array<string> │
57+
├──────────────────────────────┤
58+
│ ['AGILENT', 'TECHNOLOGIES,'] │
59+
│ ['NOBEL'] │
60+
│ ['NOBEL'] │
61+
│ ['ALCATEL'] │
62+
│ ['ALCATEL'] │
63+
│ ['ALCATEL'] │
64+
│ ['CANON', 'EUROPA'] │
65+
│ ['CANON', 'EUROPA'] │
66+
│ ['CANON', 'EUROPA'] │
67+
│ [] │
68+
│ … │
69+
└──────────────────────────────┘
70+
71+
Now, block the tables together wherever two records share a token.
72+
Note that this blocked `* SCHLUMBERGER LIMITED` with `* SCHLUMBERGER TECHNOLOGY BV`
73+
because they both share the `SCHLUMBERGER` token.
74+
75+
>>> linker = mismo.KeyLinker(tokens.unnest())
76+
>>> linker(t, t).links.filter(_.name_l != _.name_r).order_by(
77+
... "record_id_l", "record_id_r"
78+
... ).head() # doctest: +SKIP
79+
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
80+
┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l ┃ name_r ┃
81+
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
82+
│ int64 │ int64 │ float64 │ float64 │ string │ string │
83+
├─────────────┼─────────────┼────────────┼────────────┼────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────┤
84+
│ 2909 │ 13390969 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V │
85+
│ 2909 │ 13390970 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V. │
86+
│ 2909 │ 13391015 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherland B.V. │
87+
│ 2909 │ 13391055 │ 0.0 │ 52.50 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
88+
│ 2909 │ 13391056 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
89+
└─────────────┴─────────────┴────────────┴────────────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
90+
"""
8691

8792
def __init__(self, column: str, *, task: Literal["dedupe", "link"] | None = None):
8893
self.column = column

mismo/linker/_or_linker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def __call__(self, left: ibis.Table, right: ibis.Table) -> Linkage:
5959
if left is right:
6060
right = right.view()
6161
if not self._join_conditions:
62-
return mismo.empty_linkage(left, right)
62+
return mismo.EmptyLinker()(left, right)
6363
conditions = [
6464
c.__join_condition__(left, right) for c in self._join_conditions.values()
6565
]

0 commit comments

Comments
 (0)