Skip to content

Commit 01ad04c

Browse files
authored
CU-8699py5m0 Multiprocessing issues (#34)
* CU-8699py5m0: Fix ordering of text/index when batching on a per char length basis
* CU-8699py5m0: Set addon data paths for other threads upon multiprocessing
* CU-8699py5m0: Fix ordering of text and index when doing sequentially
* CU-8699py5m0: Update tests with correct order of text/index for multiprocessing
1 parent b251ead commit 01ad04c

File tree

2 files changed

+24
-4
lines changed

2 files changed

+24
-4
lines changed

medcat-v2/medcat/cat.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from medcat.utils.defaults import doing_legacy_conversion_message
3232
from medcat.utils.defaults import LegacyConversionDisabledError
3333
from medcat.utils.usage_monitoring import UsageMonitor
34+
from medcat.utils.import_utils import MissingDependenciesError
3435

3536

3637
logger = logging.getLogger(__name__)
@@ -157,6 +158,25 @@ def _mp_worker_func(
157158
self,
158159
texts_and_indices: list[tuple[str, str, bool]]
159160
) -> list[tuple[str, str, Union[dict, Entities, OnlyCUIEntities]]]:
161+
# NOTE: this is needed for subprocesses as otherwise they wouldn't have
162+
# any of these set
163+
# NOTE: these need to be dynamic in case the extras aren't included
164+
try:
165+
from medcat.components.addons.meta_cat import MetaCATAddon
166+
has_meta_cat = True
167+
except MissingDependenciesError:
168+
has_meta_cat = False
169+
try:
170+
from medcat.components.addons.relation_extraction.rel_cat import (
171+
RelCATAddon)
172+
has_rel_cat = True
173+
except MissingDependenciesError:
174+
has_rel_cat = False
175+
for addon in self._pipeline.iter_addons():
176+
if has_meta_cat and isinstance(addon, MetaCATAddon):
177+
addon._init_data_paths(self._pipeline.tokenizer)
178+
elif has_rel_cat and isinstance(addon, RelCATAddon):
179+
addon._rel_cat._init_data_paths()
160180
return [
161181
(text, text_index, self.get_entities(text, only_cui=only_cui))
162182
for text, text_index, only_cui in texts_and_indices]
@@ -180,7 +200,7 @@ def _generate_batches_by_char_length(
180200
yield docs
181201
docs = []
182202
char_count = clen
183-
docs.append((doc_index, doc, only_cui))
203+
docs.append((doc, doc_index, only_cui))
184204

185205
if len(docs) > 0:
186206
yield docs
@@ -326,7 +346,7 @@ def get_entities_multi_texts(
326346
if n_process == 1:
327347
# just do in series
328348
for batch in batch_iter:
329-
for text_index, _, result in self._mp_worker_func(batch):
349+
for _, text_index, result in self._mp_worker_func(batch):
330350
yield text_index, result
331351
return
332352

medcat-v2/tests/test_cat.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -728,7 +728,7 @@ def test_batching_gets_full_char(self):
728728
# has all texts
729729
self.assertEqual(sum(len(batch) for batch in batches), self.NUM_TEXTS)
730730
# has all characters
731-
self.assertEqual(sum(len(text[1]) for text in batches[0]),
731+
self.assertEqual(sum(len(text[0]) for text in batches[0]),
732732
self.total_text_length)
733733

734734
def test_batching_gets_all_half_at_a_time(self):
@@ -746,7 +746,7 @@ def test_batching_gets_all_half_at_a_time(self):
746746
# has all texts
747747
self.assertEqual(sum(len(batch) for batch in batches), self.NUM_TEXTS)
748748
# has all characters
749-
self.assertEqual(sum(len(text[1])
749+
self.assertEqual(sum(len(text[0])
750750
for batch in batches for text in batch),
751751
self.total_text_length)
752752

0 commit comments

Comments
 (0)