Skip to content

Commit 95c5612

Browse files
authored
Improve speed of header deid with lookup tables and caching (#289)
* Pre-compile regexes and use string comparison where possible During profiling, it was identified that repeated regex lookups inside loops were taking a significant amount of time. To reduce this, regex expressions were pre-compiled outside of loops, and plain string comparison was used when possible to sidestep the performance overhead of regex matching for simple things like string equality comparison. * Use lookup tables to reduce field parsing The `get_fields_with_lookup` function was added to augment the `get_fields` function with a lookup table that allows quick identification of exact tag matches. This optimization significantly reduced the amount of time spent in the exact matching stage of `expand_field_expression`. For top-level DICOM tag searches, the search in "Case 2" would call `name_contains` on every single tag in the DICOM dataset. With lookup tables, we can look up a contender field based on the values for which we're checking exact matches-- this becomes a key lookup problem rather than searching all fields for a matching identifier. * Add cache layer around field retrieval * Fix test failures * Use isinstance for type comparison * Add docstrings to expander functions * Use a wrapper class for field lookup tables * Respond to PR feedback * Fix formatting * Remove case-sensitivity of fields in lookup tables * Remove mentions of whole_string kwarg in docs * Move scope of hash function override to local function * Switch to context manager for cleaner override
1 parent 8ed7a7e commit 95c5612

File tree

6 files changed

+265
-65
lines changed

6 files changed

+265
-65
lines changed

deid/dicom/fields.py

Lines changed: 215 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,12 @@
33
__license__ = "MIT"
44

55
import re
6+
from collections import defaultdict
7+
from contextlib import contextmanager
8+
from copy import deepcopy
9+
from functools import cache
610

11+
from pydicom import FileDataset
712
from pydicom.dataelem import DataElement
813
from pydicom.dataset import Dataset, FileMetaDataset, RawDataElement
914
from pydicom.sequence import Sequence
@@ -47,12 +52,10 @@ def stripped_tag(self):
4752

4853
# Contains
4954

50-
def name_contains(self, expression, whole_string=False):
55+
def name_contains(self, expression):
5156
"""
52-
Determine if a name contains a pattern or expression.
53-
Use whole_string to match the entire string exactly (True),
54-
or partially (False).
55-
Use re to search a field for a regular expression, meaning
57+
Determine if a name contains an expression.
58+
Use expression to search a field, meaning
5659
the name, the keyword (nested) or the string tag.
5760
name.lower: includes nested keywords (e.g., Sequence_Child)
5861
self.element.name: is the human friendly name "Sequence Child"
@@ -62,7 +65,7 @@ def name_contains(self, expression, whole_string=False):
6265
- Patient's Name (tag name)
6366
- patient's name (lowercase tag name)
6467
- PatientName (tag keyword)
65-
- PatientN (tag keyword partial match, with whole_string=False)
68+
- PatientN (tag keyword partial match)
6669
- (0010,0010) (parentheses-enclosed, comma-separated group, element)
6770
- 00100010 (stripped group, element)
6871
@@ -77,14 +80,17 @@ def name_contains(self, expression, whole_string=False):
7780
- PRIVATE_CREATOR: Private creator string in double quotes
7881
- ELEMENT_OFFSET: 2-digit hexadecimal element number (last 8 bits of full element)
7982
"""
80-
regexp_expression = f"^{expression}$" if whole_string else expression
83+
if isinstance(expression, str):
84+
expression = re.compile(expression, re.IGNORECASE)
85+
8186
if (
82-
re.search(regexp_expression, self.name.lower())
83-
or f"({self.element.tag.group:04X},{self.element.tag.element:04X})".lower()
84-
== expression.lower()
85-
or re.search(regexp_expression, self.stripped_tag)
86-
or re.search(regexp_expression, self.element.name)
87-
or re.search(regexp_expression, self.element.keyword)
87+
expression.search(self.name.lower())
88+
or expression.search(
89+
f"({self.element.tag.group:04X},{self.element.tag.element:04X})".lower()
90+
)
91+
or expression.search(self.stripped_tag)
92+
or expression.search(self.element.name)
93+
or expression.search(self.element.keyword)
8894
):
8995
return True
9096

@@ -99,11 +105,10 @@ def name_contains(self, expression, whole_string=False):
99105
# The GROUP is the 4-digit hex group number (e.g., 0033)
100106
# The PRIVATE_CREATOR is the private creator string in quotes
101107
# The ELEMENT_OFFSET is the 2-digit hex element number (masked to last 8 bits)
102-
stripped_private_tag = f'{self.element.tag.group:04X},"{self.element.private_creator}",{(self.element.tag.element&0xFF):02X}'
108+
stripped_private_tag = f'{self.element.tag.group:04X},"{self.element.private_creator}",{(self.element.tag.element & 0xFF):02X}'
103109
private_tag = "(" + stripped_private_tag + ")"
104-
if (
105-
re.search(regexp_expression, stripped_private_tag, re.IGNORECASE)
106-
or private_tag.lower() == expression.lower()
110+
if re.search(expression, stripped_private_tag) or re.search(
111+
expression, private_tag
107112
):
108113
return True
109114
return False
@@ -225,29 +230,25 @@ def expand_field_expression(field, dicom, contenders=None):
225230
allfields: include all fields
226231
exceptfields: filter to all fields except those listed ( | separated)
227232
228-
Returns: a list of DicomField objects
233+
Returns: a dictionary of DicomField objects
229234
"""
230235
# Expanders that don't have a : must be checked for
231236
expanders = ["all"]
232237

233238
# if no contenders provided, use top level of dicom headers
234239
if contenders is None:
235-
contenders = get_fields(dicom)
240+
contenders = get_fields_with_lookup(dicom)
236241

237242
# Case 1: field is an expander without an argument (e.g., no :)
238243
if field.lower() in expanders:
239244
if field.lower() == "all":
240-
fields = contenders
245+
fields = deepcopy(contenders.fields)
241246
return fields
242247

243248
# Case 2: The field is a specific field OR an expander with argument (A:B)
244249
fields = field.split(":", 1)
245250
if len(fields) == 1:
246-
return {
247-
uid: field
248-
for uid, field in contenders.items()
249-
if field.name_contains(fields[0], whole_string=True)
250-
}
251+
return {field.uid: field for field in contenders.get_exact_matches(fields[0])}
251252

252253
# if we get down here, we have an expander and expression
253254
expander, expression = fields
@@ -260,32 +261,205 @@ def expand_field_expression(field, dicom, contenders=None):
260261
elif expander.lower() == "startswith":
261262
expression = "^(%s)" % expression
262263

264+
expression_re = None
263265
# Loop through fields, all are strings STOPPED HERE NEED TO ADDRESS EMPTY NAME
264266
for uid, field in contenders.items():
265-
# Apply expander to string for name OR to tag string
266-
if expander.lower() in ["endswith", "startswith", "contains"]:
267-
if field.name_contains(expression):
268-
fields[uid] = field
267+
if isinstance(field, str) and string_matches_expander(
268+
expander, expression, field
269+
):
270+
fields[uid] = field
271+
elif isinstance(field, DicomField) and field_matches_expander(
272+
expander, expression, expression_re, field
273+
):
274+
fields[uid] = field
269275

270-
elif expander.lower() == "except":
271-
if not field.name_contains(expression):
272-
fields[uid] = field
276+
return fields
273277

274-
elif expander.lower() == "select":
275-
if field.select_matches(expression):
276-
fields[uid] = field
277278

278-
return fields
279+
def string_matches_expander(expander, expression_string, string):
280+
"""
281+
Check if a string matches an expression for a given expander operator.
282+
"""
283+
if expander.lower() in ["endswith", "startswith", "contains"]:
284+
return expression_string in string
285+
286+
elif expander.lower() == "except":
287+
return expression_string not in string
288+
289+
# Note: "select" expanders are not applicable to string values, as
290+
# they do not have any attributes.
291+
return False
292+
293+
294+
def field_matches_expander(expander, expression_string, expression_re, field):
295+
"""
296+
Check if a field matches an expression for a given expander operator.
297+
298+
The `expression_re` argument is an optional arg that can be used to cache
299+
the compiled regex for re-use in later invocations.
300+
"""
301+
# Apply expander to string for name OR to tag string
302+
if expander.lower() == "select":
303+
return field.select_matches(expression_string)
304+
305+
if expression_re is None:
306+
expression_re = re.compile(expression_string, re.IGNORECASE)
307+
308+
if expander.lower() in ["endswith", "startswith", "contains"]:
309+
return field.name_contains(expression_re)
279310

311+
elif expander.lower() == "except":
312+
return not field.name_contains(expression_re)
280313

281-
def get_fields(dicom, skip=None, expand_sequences=True, seen=None):
282-
"""Expand all dicom fields into a list.
314+
return False
315+
316+
317+
class FieldsWithLookups:
318+
"""
319+
This class is a wrapper around a dictionary of DicomField objects keyed by uid,
320+
with some supplemental lookup tables to enable rapid field lookup.
321+
"""
322+
323+
def __init__(self, fields):
324+
self.fields = fields
325+
326+
self.lookup_tables = {
327+
"name": defaultdict(list),
328+
"tag": defaultdict(list),
329+
"stripped_tag": defaultdict(list),
330+
"element_name": defaultdict(list),
331+
"element_keyword": defaultdict(list),
332+
}
333+
for uid, field in fields.items():
334+
self._add_field_to_lookup(field)
335+
336+
def get_exact_matches(self, field):
337+
"""
338+
Get exact case-insensitive matches for a field name or tag.
339+
340+
Returns a list of DicomField objects.
341+
"""
342+
if isinstance(field, str):
343+
field = field.lower()
344+
exact_match_contenders = (
345+
self.lookup_tables["name"][field]
346+
+ self.lookup_tables["tag"][field]
347+
+ self.lookup_tables["stripped_tag"][field]
348+
+ self.lookup_tables["element_name"][field]
349+
+ self.lookup_tables["element_keyword"][field]
350+
)
351+
return exact_match_contenders
352+
353+
def __getitem__(self, key):
354+
return self.fields[key]
355+
356+
def __setitem__(self, key, value):
357+
self.fields[key] = value
358+
359+
def __iter__(self):
360+
return iter(self.fields)
361+
362+
def items(self):
363+
return self.fields.items()
364+
365+
def values(self):
366+
return self.fields.values()
367+
368+
def add(self, uid, field):
369+
self.fields[uid] = field
370+
self._add_field_to_lookup(field)
371+
372+
def _get_field_lookup_keys(self, field):
373+
if not isinstance(field, DicomField):
374+
case_insensitive_field = field.lower()
375+
self.lookup_tables["name"][case_insensitive_field].append(
376+
case_insensitive_field
377+
)
378+
return {"name": [case_insensitive_field]}
379+
lookup_keys = {}
380+
if field.name:
381+
lookup_keys["name"] = [field.name.lower()]
382+
if field.tag:
383+
lookup_keys["tag"] = [field.tag.lower()]
384+
if field.stripped_tag:
385+
lookup_keys["stripped_tag"] = [field.stripped_tag.lower()]
386+
if field.element.name:
387+
lookup_keys["element_name"] = [field.element.name.lower()]
388+
if field.element.keyword:
389+
lookup_keys["element_keyword"] = [field.element.keyword.lower()]
390+
if field.element.is_private:
391+
lookup_keys["name"] = lookup_keys.get("name", []) + [
392+
f'({field.element.tag.group:04X},"{field.element.private_creator}",{(field.element.tag.element & 0x00FF):02X})'.lower(),
393+
f'{field.element.tag.group:04X},"{field.element.private_creator}",{(field.element.tag.element & 0x00FF):02X}'.lower(),
394+
]
395+
return lookup_keys
396+
397+
def _add_field_to_lookup(self, field):
398+
for table_name, lookup_keys in self._get_field_lookup_keys(field).items():
399+
for key in lookup_keys:
400+
self.lookup_tables[table_name][key].append(field)
401+
402+
def remove(self, uid):
403+
if uid not in self.fields:
404+
return
405+
field = self.fields[uid]
406+
for table_name, lookup_keys in self._get_field_lookup_keys(field).items():
407+
for key in lookup_keys:
408+
if field not in self.lookup_tables[table_name][key]:
409+
continue
410+
self.lookup_tables[table_name][key].remove(field)
411+
if not self.lookup_tables[table_name][key]:
412+
del self.lookup_tables[table_name][key]
413+
del self.fields[uid]
414+
415+
416+
@contextmanager
417+
def override_attr(obj, attr, value):
418+
"""
419+
Temporarily override an attribute of an object.
420+
"""
421+
attribute_undefined = not hasattr(obj, attr)
422+
try:
423+
original_value = getattr(obj, attr, None)
424+
setattr(obj, attr, value)
425+
yield
426+
finally:
427+
if attribute_undefined:
428+
delattr(obj, attr)
429+
else:
430+
setattr(obj, attr, original_value)
431+
432+
433+
def get_fields_with_lookup(dicom, skip=None, expand_sequences=True, seen=None):
434+
"""Expand all dicom fields into a list, along with lookup tables keyed on
435+
different field properties.
283436
284437
Each entry is a DicomField. If we find a sequence, we unwrap it and
285438
represent the location with the name (e.g., Sequence__Child)
286439
"""
287-
skip = skip or []
288-
seen = seen or []
440+
# NOTE: this hashing function is required to enable caching on
441+
# `get_fields_inner`. While it is not ideal to override the hashing
442+
# behavior of the PyDicom FileDataset class, it appears to be the
443+
# only way to enable the use of caching without incurring significant
444+
# performance overhead. Note that adding a proxy class around this
445+
# decreases performance substantially (50% slowdown measured).
446+
with override_attr(FileDataset, "__hash__", lambda self: id(self)):
447+
fields, new_seen, new_skip = _get_fields_inner(
448+
dicom,
449+
skip=tuple(skip) if skip else None,
450+
expand_sequences=expand_sequences,
451+
seen=tuple(seen) if seen else None,
452+
)
453+
skip = new_skip
454+
seen = new_seen
455+
456+
return FieldsWithLookups(fields)
457+
458+
459+
@cache
460+
def _get_fields_inner(dicom, skip=None, expand_sequences=True, seen=None):
461+
skip = list(skip) if skip else []
462+
seen = list(seen) if seen else []
289463
fields = {} # indexed by nested tag
290464

291465
if not isinstance(skip, list):
@@ -357,4 +531,4 @@ def add_element(element, name, uid, is_filemeta):
357531
"Unrecognized type %s in extract sequences, skipping." % type(item)
358532
)
359533

360-
return fields
534+
return fields, seen, skip

deid/dicom/groups.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from deid.logger import bot
99

10-
from .fields import expand_field_expression, get_fields
10+
from .fields import expand_field_expression, get_fields_with_lookup
1111

1212

1313
def extract_values_list(dicom, actions, fields=None):
@@ -20,12 +20,14 @@ def extract_values_list(dicom, actions, fields=None):
2020

2121
# The function can be provided fields to save re-parsing
2222
if not fields:
23-
fields = get_fields(dicom)
23+
fields = get_fields_with_lookup(dicom)
2424

2525
for action in actions:
2626
# Extract some subset of fields based on action
2727
subset = expand_field_expression(
28-
field=action["field"], dicom=dicom, contenders=fields
28+
field=action["field"],
29+
dicom=dicom,
30+
contenders=fields,
2931
)
3032

3133
# Just grab the entire value string for a field, no parsing
@@ -72,7 +74,7 @@ def extract_values_list(dicom, actions, fields=None):
7274
return list(values)
7375

7476

75-
def extract_fields_list(dicom, actions, fields=None):
77+
def extract_fields_list(dicom, actions, fields=None, field_lookup_tables=None):
7678
"""Given a list of actions for a named group (a list) extract values from
7779
the dicom based on the list of actions provided. This function
7880
always returns a list intended to update some lookup to be used
@@ -81,13 +83,15 @@ def extract_fields_list(dicom, actions, fields=None):
8183
subset = {}
8284

8385
if not fields:
84-
fields = get_fields(dicom)
86+
fields = get_fields_with_lookup(dicom)
8587

8688
for action in actions:
8789
if action["action"] == "FIELD":
8890
subset.update(
8991
expand_field_expression(
90-
field=action["field"], dicom=dicom, contenders=fields
92+
field=action["field"],
93+
dicom=dicom,
94+
contenders=fields,
9195
)
9296
)
9397

0 commit comments

Comments
 (0)