1010
1111class FullLinker (_common .Linker ):
1212 """
13- A [Linker][mismo.Linker] that yields all possible pairs.
14-
15- This will be N x M pairs for linking tasks,
16- and N x (M-1) pairs for deduplication tasks.
13+ A [Linker][mismo.Linker] that yields all possible pairs (MxN of them).
1714 """
1815
1916 def __init__ (self , * , task : Literal ["dedupe" , "link" ] | None = None ):
@@ -29,34 +26,12 @@ def __call__(self, left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
2926 return self ._linker (left , right )
3027
3128
32- def full_linkage (
33- left : ibis .Table ,
34- right : ibis .Table ,
35- * ,
36- task : Literal ["dedupe" , "link" ] | None = None ,
37- ) -> _linkage .Linkage :
38- """
39- Create a linkage with all (M x N) possible pairs of the two tables.
40-
41- Parameters
42- ----------
43- left
44- The left table to link.
45- right
46- The right table to link.
47- task
48- The task type, either "dedupe" or "link".
49- If None, will be inferred from whether the left and right tables are the same.
50- """
51- return FullLinker (task = task )(left , right )
52-
53-
5429class EmptyLinker (_common .Linker ):
5530 """A [Linker][mismo.Linker] that yields no pairs."""
5631
57- def __init__ (self ):
58- # The task doesn't matter here, since we won't be linking anything.
59- self ._linker = _join_linker .JoinLinker (False , on_slow = "ignore" , task = "link" )
32+ def __init__ (self , * , task : Literal [ "dedupe" , "link" ] | None = None ):
33+ self . task = task
34+ self ._linker = _join_linker .JoinLinker (False , on_slow = "ignore" , task = task )
6035
6136 def __join_condition__ (
6237 self , left : ibis .Table , right : ibis .Table
@@ -67,22 +42,52 @@ def __call__(self, left: ibis.Table, right: ibis.Table) -> _linkage.Linkage:
6742 return self ._linker (left , right )
6843
6944
70- def empty_linkage (left : ibis .Table , right : ibis .Table ) -> _linkage .Linkage :
71- """
72- Create a Linkage with no pairs. This is useful for testing or as a placeholder.
73-
74- Parameters
75- ----------
76- left
77- The left table to link.
78- right
79- The right table to link.
80- """
81- return EmptyLinker ()(left , right )
82-
83-
8445class UnnestLinker (_common .Linker ):
85- """A [Linker][mismo.Linker] that unnests a column before linking."""
46+ """A [Linker][mismo.Linker] that unnests a column before linking.
47+
48+ We can even block on arrays! For example, first let's split each name into
49+ significant tokens:
50+
51+ >>> tokens = _.name.upper().split(" ").filter(lambda x: x.length() > 4)
52+ >>> t.select(tokens.name("tokens"))
53+ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
54+ ┃ tokens ┃
55+ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
56+ │ array<string> │
57+ ├──────────────────────────────┤
58+ │ ['AGILENT', 'TECHNOLOGIES,'] │
59+ │ ['NOBEL'] │
60+ │ ['NOBEL'] │
61+ │ ['ALCATEL'] │
62+ │ ['ALCATEL'] │
63+ │ ['ALCATEL'] │
64+ │ ['CANON', 'EUROPA'] │
65+ │ ['CANON', 'EUROPA'] │
66+ │ ['CANON', 'EUROPA'] │
67+ │ [] │
68+ │ … │
69+ └──────────────────────────────┘
70+
71+ Now, block the tables together wherever two records share a token.
72+ Note that this blocked `* SCHLUMBERGER LIMITED` with `* SCHLUMBERGER TECHNOLOGY BV`
73+ because they both share the `SCHLUMBERGER` token.
74+
75+ >>> linker = mismo.KeyLinker(tokens.unnest())
76+ >>> linker(t, t).links.filter(_.name_l != _.name_r).order_by(
77+ ... "record_id_l", "record_id_r"
78+ ... ).head() # doctest: +SKIP
79+ ┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
80+ ┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l ┃ name_r ┃
81+ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
82+ │ int64 │ int64 │ float64 │ float64 │ string │ string │
83+ ├─────────────┼─────────────┼────────────┼────────────┼────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────┤
84+ │ 2909 │ 13390969 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V │
85+ │ 2909 │ 13390970 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V. │
86+ │ 2909 │ 13391015 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherland B.V. │
87+ │ 2909 │ 13391055 │ 0.0 │ 52.50 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
88+ │ 2909 │ 13391056 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
89+ └─────────────┴─────────────┴────────────┴────────────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
90+ """
8691
8792 def __init__ (self , column : str , * , task : Literal ["dedupe" , "link" ] | None = None ):
8893 self .column = column
0 commit comments