Skip to content

Commit 8984196

Browse files
committed
docs: improve docstring of KeyLinker
1 parent 34cb815 commit 8984196

File tree

1 file changed

+5
-49
lines changed

1 file changed

+5
-49
lines changed

mismo/linker/_key_linker.py

Lines changed: 5 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
class KeyLinker(Linker):
1717
"""A [Linker][mismo.Linker] that links records wherever they share a key, e.g. "emails match."
1818
19-
This is one of the most basic blocking rules, used very often in record linkage.
20-
This is what is used in `splink`.
19+
This is one of the most basic linking rules, used very often in record linkage.
2120
2221
Examples
2322
--------
@@ -65,7 +64,7 @@ class KeyLinker(Linker):
6564
- the latitudes, rounded to 1 decimal place, are the same
6665
6766
>>> linker = mismo.KeyLinker((_["name"][:5].upper(), _.latitude.round(1)))
68-
>>> blocker(t, t).order_by("record_id_l", "record_id_r").head() # doctest: +SKIP
67+
>>> linker(t, t).order_by("record_id_l", "record_id_r").head() # doctest: +SKIP
6968
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
7069
┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l ┃ name_r ┃
7170
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
@@ -77,49 +76,6 @@ class KeyLinker(Linker):
7776
│ 15041 │ 15043 │ 0.00 │ 0.00 │ * CANON EUROPA N.V │ * CANON EUROPA NV │
7877
│ 15042 │ 15043 │ 0.00 │ 0.00 │ * CANON EUROPA N.V. │ * CANON EUROPA NV │
7978
└─────────────┴─────────────┴────────────┴────────────┴─────────────────────┴─────────────────────┘
80-
81-
We can even block on arrays! For example, first let's split each name into
82-
significant tokens:
83-
84-
>>> tokens = _.name.upper().split(" ").filter(lambda x: x.length() > 4)
85-
>>> t.select(tokens.name("tokens"))
86-
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
87-
┃ tokens ┃
88-
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
89-
│ array<string> │
90-
├──────────────────────────────┤
91-
│ ['AGILENT', 'TECHNOLOGIES,'] │
92-
│ ['NOBEL'] │
93-
│ ['NOBEL'] │
94-
│ ['ALCATEL'] │
95-
│ ['ALCATEL'] │
96-
│ ['ALCATEL'] │
97-
│ ['CANON', 'EUROPA'] │
98-
│ ['CANON', 'EUROPA'] │
99-
│ ['CANON', 'EUROPA'] │
100-
│ [] │
101-
│ … │
102-
└──────────────────────────────┘
103-
104-
Now, block the tables together wherever two records share a token.
105-
Note that this blocked `* SCHLUMBERGER LIMITED` with `* SCHLUMBERGER TECHNOLOGY BV`
106-
because they both share the `SCHLUMBERGER` token.
107-
108-
>>> linker = mismo.KeyLinker(tokens.unnest())
109-
>>> linker(t, t).links.filter(_.name_l != _.name_r).order_by(
110-
... "record_id_l", "record_id_r"
111-
... ).head() # doctest: +SKIP
112-
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
113-
┃ record_id_l ┃ record_id_r ┃ latitude_l ┃ latitude_r ┃ name_l ┃ name_r ┃
114-
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
115-
│ int64 │ int64 │ float64 │ float64 │ string │ string │
116-
├─────────────┼─────────────┼────────────┼────────────┼────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────┤
117-
│ 2909 │ 13390969 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V │
118-
│ 2909 │ 13390970 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Inc. Netherlands B.V. │
119-
│ 2909 │ 13391015 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherland B.V. │
120-
│ 2909 │ 13391055 │ 0.0 │ 52.50 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
121-
│ 2909 │ 13391056 │ 0.0 │ 52.35 │ * AGILENT TECHNOLOGIES, INC. │ Hitachi Global Storage Technologies, Netherlands, B.V. │
122-
└─────────────┴─────────────┴────────────┴────────────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
12379
""" # noqa: E501
12480

12581
def __init__(
@@ -153,13 +109,13 @@ def __init__(
153109
max_pairs: int | None = None,
154110
task: Literal["dedupe", "lookup", "link"] | None = None,
155111
) -> None:
156-
"""Create a KeyBlocker.
112+
"""Create a KeyLinker.
157113
158114
Parameters
159115
----------
160116
keys:
161-
The keys to block on.
162-
The tables will be blocked together wherever they share ALL the keys.
117+
The keys to link on.
118+
The tables will be linked wherever they share ALL the keys.
163119
Each key can be any of the following:
164120
165121
- A string, which is interpreted as the name of a column in both tables.

0 commit comments

Comments
 (0)