Skip to content

Commit d2ef5b2

Browse files
author
Maarten Grootendorst
authored
Sparse dot fix (#22)
1 parent a60dfc6 commit d2ef5b2

File tree

4 files changed

+33
-10
lines changed

4 files changed

+33
-10
lines changed

docs/releases.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
v0.3.1
2+
- Fix exploding memory usage when using `top_n`
3+
14
v0.3.0
25
- Use `top_n` in `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings`
36

@@ -16,8 +19,7 @@ v0.1.0
1619
- More thorough documentation
1720
- Prepare for public release
1821

19-
v0.0.1
20-
22+
v0.0.1
2123
- First release of `PolyFuzz`
2224
- Matching through:
2325
- Edit Distance

polyfuzz/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
from .polyfuzz import PolyFuzz
2-
__version__ = "0.3.0"
2+
__version__ = "0.3.1"

polyfuzz/models/_utils.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,9 @@ def cosine_similarity(from_vector: np.ndarray,
8585
similarity_matrix.setdiag(0.)
8686
similarity_matrix = similarity_matrix.tocsr()
8787

88-
indices = np.flip(np.argsort(similarity_matrix.toarray(), axis=-1), axis=1)[:, :top_n]
89-
similarities = np.flip(np.sort(similarity_matrix.toarray(), axis=-1), axis=1)[:, :top_n]
90-
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]
88+
indices = _top_n_idx_sparse(similarity_matrix, top_n)
89+
similarities = _top_n_similarities_sparse(similarity_matrix, top_n, indices)
90+
indices = np.array(np.nan_to_num(np.array(indices, dtype=np.float), nan=0), dtype=np.int)
9191

9292
# Faster than knn and slower than sparse but uses more memory
9393
else:
@@ -101,9 +101,9 @@ def cosine_similarity(from_vector: np.ndarray,
101101
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]
102102

103103
# Convert results to df
104-
columns = (["From"] +
105-
["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
106-
["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)]); columns
104+
columns = (["From"] +
105+
["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
106+
["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)])
107107
matches = [[to_list[idx] for idx in indices[:, i]] for i in range(indices.shape[1])]
108108
matches = pd.DataFrame(np.vstack(([from_list], matches, similarities)).T, columns = columns)
109109

@@ -119,3 +119,24 @@ def cosine_similarity(from_vector: np.ndarray,
119119
matches.loc[matches[column] < 0.001, column.replace("Similarity", "To")] = None
120120

121121
return matches
122+
123+
124+
def _top_n_idx_sparse(matrix, n):
125+
""" Return index of top n values in each row of a sparse matrix """
126+
top_n_idx = []
127+
for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
128+
n_row_pick = min(n, ri - le)
129+
values = list(matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]])[::-1]
130+
values = [values[index] if len(values) >= index + 1 else None for index in range(n)]
131+
top_n_idx.append(values)
132+
return np.array(top_n_idx)
133+
134+
135+
def _top_n_similarities_sparse(matrix, n, indices):
136+
""" Return similarity scores of top n values in each row of a sparse matrix """
137+
similarity_scores = []
138+
for row, values in enumerate(indices):
139+
scores = [round(matrix[row, value], n) if value is not None else 0 for value in values]
140+
similarity_scores.append(scores)
141+
similarity_scores = np.array(similarity_scores).T
142+
return similarity_scores

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
setup(
3838
name="polyfuzz",
3939
packages=find_packages(exclude=["notebooks", "docs"]),
40-
version="0.3.0",
40+
version="0.3.1",
4141
author="Maarten Grootendorst",
4242
author_email="maartengrootendorst@gmail.com",
4343
description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",

0 commit comments

Comments
 (0)