Skip to content

Commit 34ad412

Browse files
committed
CU-8699qbr8e: Multiprocessing empty batches (#35)
* CU-8699qbr8e: Fix issue with multiprocessing in case of empty batches
* CU-8699qbr8e: Add a few simple tests to make sure empty and small datasets can be multiprocessed
1 parent 0039530 commit 34ad412

File tree

2 files changed

+23
-0
lines changed

2 files changed

+23
-0
lines changed

medcat-v2/medcat/cat.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -265,6 +265,9 @@ def _mp_one_batch_per_process(
265265
executor.submit(self._mp_worker_func, batch))
266266
except StopIteration:
267267
break
268+
if not futures:
269+
# NOTE: if there wasn't any data, we didn't process anything
270+
return
268271
# Main process works on next batch while workers are busy
269272
main_batch: Optional[list[tuple[str, str, bool]]]
270273
try:
@@ -282,6 +285,10 @@ def _mp_one_batch_per_process(
282285
# so we're going to wait for them to finish, yield their results,
283286
# and subsequently submit the next batch to keep them busy
284287
for _ in range(external_processes):
288+
if not futures:
289+
# NOTE: if there's no futures then there can't be
290+
# anything to batch
291+
break
285292
# Wait for any future to complete
286293
done_future = next(as_completed(futures))
287294
futures.remove(done_future)

medcat-v2/tests/test_cat.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -418,13 +418,29 @@ def test_can_get_multiple_entities(self):
418418
"The dog is sitting outside the house."
419419
]
420420
ents = list(self.cat.get_entities_multi_texts(texts))
421+
self.assert_ents(ents, texts)
422+
423+
def assert_ents(self, ents: list[tuple], texts: list[str]):
421424
self.assertEqual(len(ents), len(texts))
422425
# NOTE: text IDs are integers starting from 0
423426
exp_ids = set(str(i) for i in range(len(texts)))
424427
for ent_id_str, ent in ents:
425428
with self.subTest(f"Entity: {ent_id_str} [{ent}]"):
426429
self.assertIn(ent_id_str, exp_ids)
427430

431+
def test_can_multiprocess_empty(self):
432+
texts = []
433+
ents = list(self.cat.get_entities_multi_texts(texts, n_process=3))
434+
self.assert_ents(ents, texts)
435+
436+
def test_can_get_multiprocess(self):
437+
texts = [
438+
"The fittest most fit of chronic kidney failure",
439+
"The dog is sitting outside the house."
440+
]
441+
ents = list(self.cat.get_entities_multi_texts(texts, n_process=3))
442+
self.assert_ents(ents, texts)
443+
428444

429445
class CATWithDocAddonTests(CATIncludingTests):
430446
EXAMPLE_TEXT = "Example text to tokenize"

0 commit comments

Comments
 (0)