Skip to content

Commit 0338831

Browse files
committed
feat: Auto cleanup scores with weakref.finalize
It wasn't stated anywhere in the docs that you have to clean up the scores file (which may be a tempfile), so people might not have been doing it. I also thought this was better than the alternatives from https://stackoverflow.com/questions/865115/how-do-i-correctly-clean-up-a-python-object
1 parent bf028e9 commit 0338831

File tree

2 files changed

+21
-14
lines changed

2 files changed

+21
-14
lines changed

dedupe/api.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import itertools
1010
import logging
1111
import multiprocessing
12-
import os
1312
import pickle
1413
import sqlite3
1514
import tempfile
@@ -175,7 +174,6 @@ def partition(
175174
clusters = self.cluster(pair_scores, threshold)
176175
clusters = self._add_singletons(data, clusters)
177176
clusters = list(clusters)
178-
_cleanup_scores(pair_scores)
179177
return clusters
180178

181179
def _add_singletons(self, data: Data, clusters: Clusters) -> Clusters:
@@ -514,7 +512,6 @@ def join(
514512
links = pair_scores[pair_scores["score"] > threshold]
515513

516514
links = list(links)
517-
_cleanup_scores(pair_scores)
518515
return links
519516

520517
def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
@@ -1468,14 +1465,3 @@ def flatten_training(
14681465
y.extend([encoded_y] * len(pairs))
14691466

14701467
return examples, numpy.array(y)
1471-
1472-
1473-
def _cleanup_scores(arr: Scores) -> None:
1474-
try:
1475-
mmap_file = arr.filename # type: ignore
1476-
except AttributeError:
1477-
pass
1478-
else:
1479-
del arr
1480-
if mmap_file:
1481-
os.remove(mmap_file)

dedupe/core.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import os
1111
import queue
1212
import tempfile
13+
import weakref
1314
from typing import TYPE_CHECKING, overload
1415

1516
import numpy
@@ -176,9 +177,29 @@ def scoreDuplicates(
176177
else:
177178
scored_pairs = numpy.array([], dtype=dtype)
178179

180+
# Monkeypatch in these extra methods and attributes.
181+
# See https://docs.python.org/3/library/weakref.html#comparing-finalizers-with-del-methods
182+
scored_pairs.remove = weakref.finalize(scored_pairs, _cleanup_scores, scored_pairs) # type: ignore[attr-defined]
183+
scored_pairs.removed = property(_is_removed) # type: ignore[attr-defined]
184+
179185
return scored_pairs
180186

181187

188+
def _cleanup_scores(arr: "Scores") -> None:
    """Delete the file backing a scores array, if it has one.

    The array's ``filename`` attribute is read to locate the file. Arrays
    that have no ``filename`` attribute, or whose ``filename`` is falsy,
    are left untouched.

    A backing file that no longer exists is ignored, so calling this more
    than once for the same file is harmless. Any other ``OSError`` raised
    by ``os.remove`` still propagates.
    """
    mmap_file = getattr(arr, "filename", None)

    # Drop our own reference before touching the file, as the original code
    # did. NOTE(review): presumably this lets the mapping be released first
    # when this was the last reference -- confirm on platforms that refuse
    # to delete a file that is still mapped.
    del arr

    if not mmap_file:
        return

    try:
        os.remove(mmap_file)
    except FileNotFoundError:
        # Already removed (e.g. cleaned up earlier); nothing left to do.
        pass
197+
198+
199+
def _is_removed(self):
    """Return True when the finalizer stored on ``self.remove`` is no longer alive."""
    finalizer = self.remove
    still_pending = finalizer.alive
    return not still_pending
201+
202+
182203
def fillQueue(
183204
queue: _Queue, iterable: Iterable[Any], stop_signals: int, chunk_size: int = 20000
184205
) -> None:

0 commit comments

Comments
 (0)