Skip to content

Commit f26ee5e

Browse files
committed
added series filter and ocr
1 parent 7d77089 commit f26ee5e

File tree

2 files changed

+185
-42
lines changed

2 files changed

+185
-42
lines changed

dicom_filter.py

Lines changed: 179 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,14 @@
77
import pydicom as dicom
88
import cv2
99
import json
10-
from pflog import pflog
10+
# from pflog import pflog
1111
from pydicom.pixel_data_handlers import convert_color_space
1212
import numpy as np
13-
__version__ = '1.2.6'
13+
import re
14+
from PIL import Image
15+
import pytesseract
16+
17+
__version__ = '1.2.7'
1418

1519
DISPLAY_TITLE = r"""
1620
_ _ _ __ _ _ _
@@ -31,18 +35,159 @@
3135
help='comma separated dicom tags with values')
3236
parser.add_argument('-f', '--fileFilter', default='dcm', type=str,
3337
help='input file filter glob')
38+
# Minimum number of input files that must be found; main() compares this
# against len(mapper) and returns early when fewer files are present.
# The default is a real int so it agrees with type=int without conversion.
parser.add_argument('-m', '--minImgCount', default=1, type=int,
                    help='A configurable threshold—any series with fewer images is dropped.')
3440
parser.add_argument('-V', '--version', action='version',
3541
version=f'%(prog)s {__version__}')
36-
parser.add_argument('-t', '--outputType', default='dcm', type=str,
42+
parser.add_argument('-o', '--outputType', default='dcm', type=str,
3743
help='output file type(extension only)')
38-
parser.add_argument('-e', '--exclude', default=False, action="store_true",
39-
help='True means filter out, False means filter in.')
44+
parser.add_argument('-t', '--textInspect', default=False, action="store_true",
45+
help='True means detect text in images, else no.')
4046
parser.add_argument( '--pftelDB',
4147
dest = 'pftelDB',
4248
default = '',
4349
type = str,
4450
help = 'optional pftel server DB path')
4551

52+
class TagCondition:
    """One parsed filter clause: a tag, a comparison operator and its value(s).

    Attributes:
        tag: text found to the left of the operator (quotes/whitespace removed).
        op: one of the strings listed in ``OPERATORS``.
        values: list of strings; several entries only for ``=`` with
            OR-values such as ``CT/MR/US``, otherwise exactly one entry.
    """

    def __init__(self, tag, op, values):
        self.tag = tag
        self.op = op
        self.values = values  # list for '=' OR values; length 1 otherwise

    def __repr__(self):
        return f"<TagCondition {self.tag}{self.op}{self.values}>"


# Two-character operators are listed before their one-character prefixes so
# that ">=" is preferred over ">" when both start at the same position.
OPERATORS = ["!=", ">=", "<=", "=", ">", "<", "~"]


def _find_operator(expression):
    """Return ``(index, operator)`` for the leftmost operator, or ``(-1, None)``."""
    best_index, best_op = -1, None
    for candidate in OPERATORS:
        index = expression.find(candidate)
        if index == -1:
            continue
        # Only a strictly earlier position replaces the current choice, so on
        # a tie the entry listed first in OPERATORS (the longer one) is kept.
        if best_op is None or index < best_index:
            best_index, best_op = index, candidate
    return best_index, best_op


def parse_filter_string(filter_str):
    """Parse a comma separated filter expression into ``TagCondition`` objects.

    Each clause has the form ``<tag><op><value>``, e.g.
    ``Modality=CT/MR, StudyDate>=20200101``. The clause is split at the
    *leftmost* operator, so operator characters inside the value (for
    instance ``=`` in a regular expression) stay part of the value.

    Args:
        filter_str: the raw expression; ``None`` or an empty string yields
            an empty list.

    Returns:
        A list of ``TagCondition`` in the order the clauses were written.

    Raises:
        ValueError: a clause contains no operator or has an empty tag.
    """
    if not filter_str:
        return []

    conditions = []
    for part in (piece.strip() for piece in filter_str.split(",")):
        if not part:
            continue

        index, op = _find_operator(part)
        if op is None:
            raise ValueError(f"Invalid filter expression: {part}")

        tag = part[:index].strip().strip('"').strip("'")
        value = part[index + len(op):].strip().strip('"').strip("'")
        if not tag:
            raise ValueError(f"Invalid filter expression: {part}")

        # support OR-values for '=' operator: CT/MR/US
        values = [value]
        if op == "=" and "/" in value:
            # An empty alternative would be a substring of every element
            # string and silently match everything, so blanks are dropped.
            alternatives = [alt.strip() for alt in value.split("/")]
            alternatives = [alt for alt in alternatives if alt]
            if alternatives:
                values = alternatives

        conditions.append(TagCondition(tag, op, values))

    return conditions
90+
91+
# Numeric comparison operators mapped to plain functions, so the comparison
# never has to go through eval() (which cannot evaluate 'nan' or 'inf').
_NUMERIC_COMPARATORS = {
    ">": lambda actual, expected: actual > expected,
    "<": lambda actual, expected: actual < expected,
    ">=": lambda actual, expected: actual >= expected,
    "<=": lambda actual, expected: actual <= expected,
}


def _lookup_element(ds, tag):
    """Return ``(element, str(element))`` for *tag*, or ``None`` if unavailable."""
    try:
        elem = ds.data_element(tag)
        # data_element() can hand back None instead of raising; str(None)
        # would otherwise be compared against the filter as the text "None".
        if elem is None:
            return None
        return elem, str(elem)
    except Exception:
        # Any lookup failure is treated as "tag missing", as before.
        return None


def passes_filters(ds, conditions):
    """Check a dataset against every condition and report each step on stdout.

    Matching rules per operator:
      * ``=``  passes when any of ``cond.values`` is a substring of the full
        element string (``str(element)``).
      * ``!=`` passes when none of ``cond.values`` is such a substring.
      * ``>``, ``<``, ``>=``, ``<=`` compare ``float(str(element.value))``
        with ``float(cond.values[0])``; a non-numeric operand fails.
      * ``~``  passes when ``re.search(cond.values[0], str(element))`` finds
        a match.
    A condition with any other operator is ignored.

    Args:
        ds: object providing ``data_element(tag)``.
        conditions: iterable of objects with ``tag``, ``op`` and ``values``.

    Returns:
        ``True`` when all conditions pass (also for an empty iterable),
        ``False`` at the first failing or missing tag.

    Raises:
        re.error: the pattern of a ``~`` condition is not a valid regex.
    """
    for cond in conditions:
        found = _lookup_element(ds, cond.tag)
        if found is None:
            print(f"[{cond.tag}] MISSING TAG → fails condition {cond}")
            return False
        elem, actual_full = found

        # Value-only text is used for numeric comparisons; fall back to the
        # full element string when the element exposes no usable value.
        try:
            actual_value_only = str(elem.value)
        except Exception:
            actual_value_only = actual_full

        # Expected string for printing
        expected_str = "/".join(cond.values) if cond.op == "=" else cond.values[0]
        print(f"[{cond.tag}] expected: {cond.op}{expected_str} | actual: {actual_full}")

        if cond.op == "=":
            # Exact or OR matching against the full element string
            if not any(v in actual_full for v in cond.values):
                print("  -> FAIL (substring not found in element)")
                return False
            print("  -> OK")

        elif cond.op == "!=":
            # Negated match against the full element string
            if any(v in actual_full for v in cond.values):
                print("  -> FAIL (excluded substring found in element)")
                return False
            print("  -> OK")

        elif cond.op in _NUMERIC_COMPARATORS:
            # Numeric comparison on the value only, not the full element
            try:
                actual_number = float(actual_value_only)
                expected_number = float(cond.values[0])
            except ValueError:
                print("  -> FAIL (cannot extract numeric value)")
                return False

            result = _NUMERIC_COMPARATORS[cond.op](actual_number, expected_number)
            print(f"  -> {'OK' if result else 'FAIL'}")
            if not result:
                return False

        elif cond.op == "~":
            # Regex search over the full element string
            result = bool(re.search(cond.values[0], actual_full))
            print(f"  -> {'OK' if result else 'FAIL'}")
            if not result:
                return False

    return True
164+
165+
def extract_text_from_pixeldata(ds):
    """Run OCR over the dataset's pixel data and return the stripped text.

    An empty string is returned when the dataset has no 'PixelData' entry,
    when the pixel array is neither 2-D nor 3-D, or when any step raises
    (the error is printed, not propagated).
    """
    try:
        if 'PixelData' not in ds:
            return ""

        pixels = ds.pixel_array

        # Only 2-D and 3-D arrays are handed to PIL; anything else is skipped.
        if pixels.ndim not in (2, 3):
            return ""

        # NOTE(review): the array is passed to PIL unchanged, whatever its
        # bit depth — confirm OCR quality on 16-bit data.
        image = Image.fromarray(pixels)
        return pytesseract.image_to_string(image).strip()

    except Exception as e:
        print(f"OCR error: {e}")
        return ""
187+
188+
189+
190+
46191

47192
# The main function of this *ChRIS* plugin is denoted by this ``@chris_plugin`` "decorator."
48193
# Some metadata about the plugin is specified here. There is more metadata specified in setup.py.
@@ -56,10 +201,6 @@
56201
min_cpu_limit='1000m', # millicores, e.g. "1000m" = 1 CPU core
57202
min_gpu_limit=0 # set min_gpu_limit=1 to enable GPU
58203
)
59-
@pflog.tel_logTime(
60-
event = 'dicom_filter',
61-
log = 'Filter dicom files'
62-
)
63204
def main(options: Namespace, inputdir: Path, outputdir: Path):
64205
"""
65206
*ChRIS* plugins usually have two positional arguments: an **input directory** containing
@@ -74,9 +215,16 @@ def main(options: Namespace, inputdir: Path, outputdir: Path):
74215
print(DISPLAY_TITLE)
75216

76217
mapper = PathMapper.file_mapper(inputdir, outputdir, glob=f"**/*.{options.fileFilter}",fail_if_empty=False)
218+
219+
# Exit if minimum image count is not met
220+
if len(mapper)<options.minImgCount:
221+
print(f"Total no. of images found ({len(mapper)}) is less than {options.minImgCount}. Exiting analysis..")
222+
return
223+
print(f"Total no. of images found: {len(mapper)}")
224+
77225
for input_file, output_file in mapper:
78226
# Read each input file from the input directory that matches the input filter specified
79-
dcm_img = read_input_dicom(input_file, options.dicomFilter, options.exclude)
227+
dcm_img = read_input_dicom(input_file, options.dicomFilter, options.textInspect)
80228

81229
# check if a valid image file is returned
82230
if dcm_img is None:
@@ -107,46 +255,38 @@ def save_as_image(dcm_file, output_file_path, file_ext):
107255
cv2.imwrite(output_file_path,cv2.cvtColor(pixel_array_numpy,cv2.COLOR_RGB2BGR))
108256

109257

110-
111-
112-
113-
def read_input_dicom(input_file_path, filters, exclude):
258+
def read_input_dicom(input_file_path, filter_expression, inspect_text):
    """
    Load one DICOM file and decide whether it satisfies the filter expression.

    Steps:
      1) Parse ``filter_expression`` into conditions (a malformed expression
         raises before the file is touched).
      2) Read the file; a read failure or a dataset without 'PixelData'
         yields None.
      3) Evaluate the conditions against the dataset headers and print the
         verdict.
      4) When ``inspect_text`` is set, print the OCR text of the pixel data,
         whether or not the filter matched.

    Returns the dataset when every condition passes, otherwise None.
    """
    conditions = parse_filter_string(filter_expression)

    # Read DICOM
    try:
        print(f"Reading input file: {input_file_path.name}")
        ds = dicom.dcmread(str(input_file_path), stop_before_pixels=False)

        has_pixels = 'PixelData' in ds
        if not has_pixels:
            print("No pixel data in this DICOM.")
            return None

    except Exception as ex:
        print(f"Unable to read dicom file: {ex}")
        return None

    # Apply filters with verbose output
    print(f"\nApplying filter: {filter_expression}")
    match = passes_filters(ds, conditions)
    verdict = 'MATCH' if match else 'NO MATCH'
    print(f"Result: {verdict}\n")

    if inspect_text:
        ocr_text = extract_text_from_pixeldata(ds)
        print(ocr_text)

    if not match:
        return None
    return ds
288+
145289

146-
if exclude:
147-
print(f"file: {input_file_path.name} matches filter criteria")
148-
return None
149-
return ds
150290

151291

152292
def save_dicom(dicom_file, output_path):

requirements.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
chris_plugin>=0.3.0
22
opencv-python
33
pydicom
4-
pflog==1.2.26
5-
pftel-client~=1.0.6
4+
# pflog==1.2.26
5+
# pftel-client~=1.0.6
66
# for bug fix on transfer syntax errors
77
pylibjpeg
88
pylibjpeg-libjpeg
9-
python-gdcm
9+
python-gdcm
10+
# for running OCR
11+
pillow
12+
pytesseract

0 commit comments

Comments
 (0)