Skip to content

Commit a60dfc6

Browse files
author
Maarten Grootendorst
authored
v0.3 (#17)
* Extract multiple best matches * Add top_n to TF-IDF and Embeddings * Update documentation and prepare for release
1 parent fbe0cbb commit a60dfc6

File tree

9 files changed

+62
-25
lines changed

9 files changed

+62
-25
lines changed

docs/releases.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
v0.3.0
2+
- Use `top_n` in `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings`
3+
14
v0.2.2
25
- Update grouping to include all strings only if identical lists of strings are compared
36

polyfuzz/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
from .polyfuzz import PolyFuzz
2-
__version__ = "0.2.2"
2+
__version__ = "0.3.0"

polyfuzz/models/_base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ def __init__(self, model_id: str = "Model 0"):
1111
self.type = "Base Model"
1212

1313
@abstractmethod
14-
def match(self, from_list: List[str], to_list: List[str]) -> pd.DataFrame:
14+
def match(self,
15+
from_list: List[str],
16+
to_list: List[str]) -> pd.DataFrame:
1517
""" Make sure you follow the same argument structure:
1618
1719
Arguments:

polyfuzz/models/_embeddings.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class Embeddings(BaseMatcher):
2020
Arguments:
2121
embedding_method: list of Flair embeddings to use
2222
min_similarity: The minimum similarity between strings, otherwise return 0 similarity
23+
top_n: The number of best matches you want returned
2324
cosine_method: The method/package for calculating the cosine similarity.
2425
Options: "sparse", "sklearn", "knn".
2526
Sparse is the fastest and most memory efficient but requires a
@@ -59,6 +60,7 @@ class Embeddings(BaseMatcher):
5960
def __init__(self,
6061
embedding_method: Union[List, None] = None,
6162
min_similarity: float = 0.75,
63+
top_n: int = 1,
6264
cosine_method: str = "sparse",
6365
model_id: str = None):
6466
super().__init__(model_id)
@@ -77,6 +79,7 @@ def __init__(self,
7779
self.document_embeddings = embedding_method
7880

7981
self.min_similarity = min_similarity
82+
self.top_n = top_n
8083
self.cosine_method = cosine_method
8184

8285
def match(self,
@@ -110,7 +113,9 @@ def match(self,
110113

111114
matches = cosine_similarity(embeddings_from, embeddings_to,
112115
from_list, to_list,
113-
self.min_similarity, self.cosine_method)
116+
self.min_similarity,
117+
top_n=self.top_n,
118+
method=self.cosine_method)
114119

115120
return matches
116121

polyfuzz/models/_tfidf.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class TFIDF(BaseMatcher):
2121
n_gram_range: The n_gram_range on a character-level
2222
clean_string: Whether to clean the string such that only alphanumerical characters are kept
2323
min_similarity: The minimum similarity between strings, otherwise return 0 similarity
24+
top_n: The number of best matches you want returned
2425
cosine_method: The method/package for calculating the cosine similarity.
2526
Options:
2627
* sparse
@@ -48,6 +49,7 @@ def __init__(self,
4849
n_gram_range: Tuple[int, int] = (3, 3),
4950
clean_string: bool = True,
5051
min_similarity: float = 0.75,
52+
top_n: int = 1,
5153
cosine_method: str = "sparse",
5254
model_id: str = None):
5355
super().__init__(model_id)
@@ -56,6 +58,7 @@ def __init__(self,
5658
self.clean_string = clean_string
5759
self.min_similarity = min_similarity
5860
self.cosine_method = cosine_method
61+
self.top_n = top_n
5962

6063
def match(self,
6164
from_list: List[str],
@@ -82,7 +85,9 @@ def match(self,
8285
tf_idf_from, tf_idf_to = self._extract_tf_idf(from_list, to_list)
8386
matches = cosine_similarity(tf_idf_from, tf_idf_to,
8487
from_list, to_list,
85-
self.min_similarity, self.cosine_method)
88+
self.min_similarity,
89+
top_n=self.top_n,
90+
method=self.cosine_method)
8691

8792
return matches
8893

polyfuzz/models/_utils.py

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def cosine_similarity(from_vector: np.ndarray,
1717
from_list: List[str],
1818
to_list: List[str],
1919
min_similarity: float = 0.75,
20+
top_n: int = 1,
2021
method: str = "sparse") -> pd.DataFrame:
2122
""" Calculate similarity between two matrices/vectors and return best matches
2223
@@ -26,6 +27,7 @@ def cosine_similarity(from_vector: np.ndarray,
2627
from_list: The list from which you want mappings
2728
to_list: The list where you want to map to
2829
min_similarity: The minimum similarity between strings, otherwise return 0 similarity
30+
top_n: The number of best matches you want returned
2931
method: The method/package for calculating the cosine similarity.
3032
Options: "sparse", "sklearn", "knn".
3133
Sparse is the fastest and most memory efficient but requires a
@@ -49,20 +51,22 @@ def cosine_similarity(from_vector: np.ndarray,
4951
indices, similarity = extract_best_matches(from_vector, to_vector, method="sparse")
5052
```
5153
"""
54+
if top_n > len(set(to_list)):
55+
top_n = len(set(to_list))
56+
5257
# Slower but uses less memory
5358
if method == "knn":
5459

5560
if from_list == to_list:
56-
knn = NearestNeighbors(n_neighbors=2, n_jobs=-1, metric='cosine').fit(to_vector)
61+
knn = NearestNeighbors(n_neighbors=top_n+1, n_jobs=-1, metric='cosine').fit(to_vector)
5762
distances, indices = knn.kneighbors(from_vector)
58-
distances = distances[:, 1]
59-
indices = indices[:, 1]
60-
63+
distances = distances[:, 1:]
64+
indices = indices[:, 1:]
6165
else:
62-
knn = NearestNeighbors(n_neighbors=1, n_jobs=-1, metric='cosine').fit(to_vector)
66+
knn = NearestNeighbors(n_neighbors=top_n, n_jobs=-1, metric='cosine').fit(to_vector)
6367
distances, indices = knn.kneighbors(from_vector)
6468

65-
similarity = [round(1 - distance, 3) for distance in distances.flatten()]
69+
similarities = [np.round(1 - distances[:, i], 3) for i in range(distances.shape[1])]
6670

6771
# Fast, but does have some installation issues
6872
elif _HAVE_SPARSE_DOT and method == "sparse":
@@ -74,15 +78,16 @@ def cosine_similarity(from_vector: np.ndarray,
7478
# There is a bug with awesome_cossim_topn that when to_vector and from_vector
7579
# have the same shape, setting topn to 1 does not work. Apparently, you need
7680
# to set it to at least 2 for it to work
77-
similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, 2, min_similarity)
81+
similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, top_n+1, min_similarity)
7882

7983
if from_list == to_list:
8084
similarity_matrix = similarity_matrix.tolil()
8185
similarity_matrix.setdiag(0.)
8286
similarity_matrix = similarity_matrix.tocsr()
8387

84-
indices = np.array(similarity_matrix.argmax(axis=1).T).flatten()
85-
similarity = similarity_matrix.max(axis=1).toarray().T.flatten()
88+
indices = np.flip(np.argsort(similarity_matrix.toarray(), axis=-1), axis=1)[:, :top_n]
89+
similarities = np.flip(np.sort(similarity_matrix.toarray(), axis=-1), axis=1)[:, :top_n]
90+
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]
8691

8792
# Faster than knn and slower than sparse but uses more memory
8893
else:
@@ -91,13 +96,26 @@ def cosine_similarity(from_vector: np.ndarray,
9196
if from_list == to_list:
9297
np.fill_diagonal(similarity_matrix, 0)
9398

94-
indices = similarity_matrix.argmax(axis=1)
95-
similarity = similarity_matrix.max(axis=1)
99+
indices = np.flip(np.argsort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
100+
similarities = np.flip(np.sort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
101+
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]
96102

97103
# Convert results to df
98-
matches = [to_list[idx] for idx in indices.flatten()]
99-
matches = pd.DataFrame(np.vstack((from_list, matches, similarity)).T, columns=["From", "To", "Similarity"])
100-
matches.Similarity = matches.Similarity.astype(float)
101-
matches.loc[matches.Similarity < 0.001, "To"] = None
104+
columns = (["From"] +
105+
["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
106+
["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)]); columns
107+
matches = [[to_list[idx] for idx in indices[:, i]] for i in range(indices.shape[1])]
108+
matches = pd.DataFrame(np.vstack(([from_list], matches, similarities)).T, columns = columns)
109+
110+
# Update column order
111+
columns = [["From", "To", "Similarity"]] + [[f"To_{i+2}", f"Similarity_{i+2}"] for i in range((top_n-1))]
112+
matches = matches.loc[:, [title for column in columns for title in column]]
113+
114+
# Update types
115+
for column in matches.columns:
116+
if "Similarity" in column:
117+
matches[column] = matches[column].astype(float)
118+
matches.loc[matches[column] < 0.001, column] = float(0)
119+
matches.loc[matches[column] < 0.001, column.replace("Similarity", "To")] = None
102120

103121
return matches

polyfuzz/polyfuzz.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,17 @@ def __init__(self,
8585

8686
def match(self,
8787
from_list: List[str],
88-
to_list: List[str]):
88+
to_list: List[str],
89+
top_n: int = 1):
8990
""" Match the from_list of strings to the to_list of strings with whatever models
9091
you have initialized
9192
9293
Arguments:
9394
from_list: The list from which you want mappings
9495
to_list: The list where you want to map to
96+
top_n: The number of matches you want returned. This is currently only implemented
97+
for `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` as they
98+
can computationally handle more comparisons.
9599
96100
Updates:
97101
self.matches: A dictionary with the matches from all models, can
@@ -115,11 +119,11 @@ def match(self,
115119
# Standard models - quick access
116120
if isinstance(self.method, str):
117121
if self.method in ["TF-IDF", "TFIDF"]:
118-
self.matches = {"TF-IDF": TFIDF(min_similarity=0).match(from_list, to_list)}
122+
self.matches = {"TF-IDF": TFIDF(min_similarity=0, top_n=top_n).match(from_list, to_list)}
119123
elif self.method in ["EditDistance", "Edit Distance"]:
120124
self.matches = {"EditDistance": RapidFuzz().match(from_list, to_list)}
121125
elif self.method in ["Embeddings", "Embedding"]:
122-
self.matches = {"Embeddings": Embeddings(min_similarity=0).match(from_list, to_list)}
126+
self.matches = {"Embeddings": Embeddings(min_similarity=0, top_n=top_n).match(from_list, to_list)}
123127
else:
124128
raise ValueError("Please instantiate the model with one of the following methods: \n"
125129
"* 'TF-IDF'\n"
@@ -242,7 +246,7 @@ def get_ids(self) -> Union[str, List[str], None]:
242246
return None
243247

244248
def get_matches(self, model_id: str = None) -> Union[pd.DataFrame,
245-
Mapping[str, pd.DataFrame]]:
249+
Mapping[str, pd.DataFrame]]:
246250
""" Get the matches from one or more models"""
247251
check_matches(self)
248252

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
setup(
3838
name="polyfuzz",
3939
packages=find_packages(exclude=["notebooks", "docs"]),
40-
version="0.2.2",
40+
version="0.3.0",
4141
author="Maarten Grootendorst",
4242
author_email="maartengrootendorst@gmail.com",
4343
description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",

tests/test_linkage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,6 @@ def test_linkage(min_similarity):
2626
assert max(cluster_mapping.values()) == 1
2727
assert len(cluster_name_map) == 2
2828

29-
else:
29+
elif min_similarity >= 0.6:
3030
assert max(cluster_mapping.values()) > 1
3131
assert len(cluster_name_map) == 3

0 commit comments

Comments
 (0)