1616class KeyLinker (Linker ):
1717 """A [Linker][mismo.Linker] that links records wherever they share a key, e.g. "emails match."
1818
19- This is one of the most basic blocking rules, used very often in record linkage.
20- This is what is used in `splink`.
19+ This is one of the most basic linking rules, used very often in record linkage.
2120
2221 Examples
2322 --------
@@ -65,7 +64,7 @@ class KeyLinker(Linker):
6564 - the latitudes, rounded to 1 decimal place, are the same
6665
6766 >>> linker = mismo.KeyLinker((_["name"][:5].upper(), _.latitude.round(1)))
68- >>> blocker (t, t).order_by("record_id_l", "record_id_r").head() # doctest: +SKIP
67+ >>> linker(t, t).links.order_by("record_id_l", "record_id_r").head() # doctest: +SKIP
6968 ┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
7069 ┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l ┃ name_r ┃
7170 ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
@@ -77,49 +76,6 @@ class KeyLinker(Linker):
7776 │ 15041 │ 15043 │ 0.00 │ 0.00 │ * CANON EUROPA N.V │ * CANON EUROPA NV │
7877 │ 15042 │ 15043 │ 0.00 │ 0.00 │ * CANON EUROPA N.V. │ * CANON EUROPA NV │
7978 └─────────────┴─────────────┴────────────┴────────────┴─────────────────────┴─────────────────────┘
80-
81- We can even block on arrays! For example, first let's split each name into
82- significant tokens:
83-
84- >>> tokens = _.name.upper().split(" ").filter(lambda x: x.length() > 4)
85- >>> t.select(tokens.name("tokens"))
86- ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
87- ┃ tokens ┃
88- ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
89- │ array<string> │
90- ├──────────────────────────────┤
91- │ ['AGILENT', 'TECHNOLOGIES,'] │
92- │ ['NOBEL'] │
93- │ ['NOBEL'] │
94- │ ['ALCATEL'] │
95- │ ['ALCATEL'] │
96- │ ['ALCATEL'] │
97- │ ['CANON', 'EUROPA'] │
98- │ ['CANON', 'EUROPA'] │
99- │ ['CANON', 'EUROPA'] │
100- │ [] │
101- │ … │
102- └──────────────────────────────┘
103-
104- Now, block the tables together wherever two records share a token.
105- Note that this blocked `* SCHLUMBERGER LIMITED` with `* SCHLUMBERGER TECHNOLOGY BV`
106- because they both share the `SCHLUMBERGER` token.
107-
108- >>> linker = mismo.KeyLinker(tokens.unnest())
109- >>> linker(t, t).links.filter(_.name_l != _.name_r).order_by(
110- ... "record_id_l", "record_id_r"
111- ... ).head() # doctest: +SKIP
112- ┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
113- ┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l ┃ name_r ┃
114- ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
115- │ int64 │ int64 │ float64 │ float64 │ string │ string │
116- ├─────────────┼─────────────┼────────────┼────────────┼────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────┤
117- │ 2909 │ 13390969 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V │
118- │ 2909 │ 13390970 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V. │
119- │ 2909 │ 13391015 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherland B.V. │
120- │ 2909 │ 13391055 │ 0.0 │ 52.50 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
121- │ 2909 │ 13391056 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
122- └─────────────┴─────────────┴────────────┴────────────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
12379 """ # noqa: E501
12480
12581 def __init__ (
@@ -153,13 +109,13 @@ def __init__(
153109 max_pairs : int | None = None ,
154110 task : Literal ["dedupe" , "lookup" , "link" ] | None = None ,
155111 ) -> None :
156- """Create a KeyBlocker .
112+ """Create a KeyLinker.
157113
158114 Parameters
159115 ----------
160116 keys:
161- The keys to block on.
162- The tables will be blocked together wherever they share ALL the keys.
117+ The keys to link on.
118+ The tables will be linked wherever they share ALL the keys.
163119 Each key can be any of the following:
164120
165121 - A string, which is interpreted as the name of a column in both tables.
0 commit comments