Skip to content

Commit 354d1bf

Browse files
chaos-dotcom and claude committed
HRT-0004: extract sample collection date from PDF/OCR text
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 69cf44b commit 354d1bf

File tree

1 file changed

+99
-0
lines changed

1 file changed

+99
-0
lines changed

crates/web/src/pages/create_blood_test.rs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ struct OcrExtraction {
5050
prolactin: Option<OcrValue>,
5151
shbg: Option<OcrValue>,
5252
fai: Option<OcrValue>,
53+
sample_date: Option<String>,
5354
}
5455

5556
#[derive(Clone, Debug, Deserialize)]
@@ -85,8 +86,13 @@ fn apply_ocr_extraction(
8586
shbg_level: RwSignal<String>,
8687
shbg_unit: RwSignal<String>,
8788
free_androgen_index: RwSignal<String>,
89+
test_date_time: RwSignal<String>,
8890
) -> usize {
8991
let mut filled = 0;
92+
if let Some(date) = extracted.sample_date {
93+
test_date_time.set(date);
94+
filled += 1;
95+
}
9096
if let Some(value) = extracted.estradiol {
9197
estradiol_level.set(value.value);
9298
if let Some(unit) = value.unit {
@@ -259,6 +265,94 @@ fn extract_ocr_value(text: &str, labels: &[&str]) -> Option<OcrValue> {
259265
None
260266
}
261267

268+
/// Maps an English month name or abbreviation to its 1-based month number.
///
/// Matching ignores ASCII case, surrounding whitespace, and a trailing `.` or
/// `,` (OCR output frequently yields tokens such as `"Mar."` or `"Sept,"`).
/// Both three-letter abbreviations and full names are accepted, plus the
/// common four-letter `"sept"`.
///
/// Returns `None` when `name` is not a recognised month.
fn month_from_name(name: &str) -> Option<u32> {
    let normalized = name
        .trim()
        .trim_end_matches(['.', ','])
        .to_ascii_lowercase();
    match normalized.as_str() {
        "jan" | "january" => Some(1),
        "feb" | "february" => Some(2),
        "mar" | "march" => Some(3),
        "apr" | "april" => Some(4),
        "may" => Some(5),
        "jun" | "june" => Some(6),
        "jul" | "july" => Some(7),
        "aug" | "august" => Some(8),
        "sep" | "sept" | "september" => Some(9),
        "oct" | "october" => Some(10),
        "nov" | "november" => Some(11),
        "dec" | "december" => Some(12),
        _ => None,
    }
}
285+
286+
fn try_parse_date(tokens: &[&str]) -> Option<String> {
287+
if tokens.is_empty() {
288+
return None;
289+
}
290+
// Try DD/MM/YYYY or DD-MM-YYYY
291+
let first = tokens[0];
292+
for sep in ['/', '-'] {
293+
let parts: Vec<&str> = first.split(sep).collect();
294+
if parts.len() == 3 {
295+
let a: u32 = parts[0].parse().ok()?;
296+
let b: u32 = parts[1].parse().ok()?;
297+
let c: u32 = parts[2].parse().ok()?;
298+
let (year, month, day) = if c > 100 {
299+
// DD/MM/YYYY
300+
(c, b, a)
301+
} else if a > 100 {
302+
// YYYY/MM/DD
303+
(a, b, c)
304+
} else {
305+
return None;
306+
};
307+
if (1..=12).contains(&month) && (1..=31).contains(&day) {
308+
return Some(format!("{year:04}-{month:02}-{day:02}T12:00"));
309+
}
310+
}
311+
}
312+
// Try DD MMM YYYY (tokens: ["15", "Mar", "2025"])
313+
if tokens.len() >= 3 {
314+
if let Ok(day) = tokens[0].parse::<u32>() {
315+
if let Some(month) = month_from_name(&tokens[1].to_lowercase()) {
316+
if let Ok(year) = tokens[2].trim_end_matches(|c: char| !c.is_ascii_digit()).parse::<u32>() {
317+
if (1..=31).contains(&day) && year > 1900 {
318+
return Some(format!("{year:04}-{month:02}-{day:02}T12:00"));
319+
}
320+
}
321+
}
322+
}
323+
}
324+
None
325+
}
326+
327+
fn extract_sample_date(text: &str) -> Option<String> {
328+
let lower = text.to_lowercase();
329+
let labels = [
330+
"sample collection date",
331+
"collection date",
332+
"date collected",
333+
"date of collection",
334+
"sample date",
335+
"specimen collection date",
336+
"specimen date",
337+
"collected on",
338+
"collected date",
339+
"date of sample",
340+
];
341+
for label in labels {
342+
if let Some(idx) = lower.find(label) {
343+
let start = idx + label.len();
344+
let end = (start + 80).min(text.len());
345+
let window = &text[start..end];
346+
let trimmed = window.trim_start_matches([':', ' ', '\t']);
347+
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
348+
if let Some(date) = try_parse_date(&tokens) {
349+
return Some(date);
350+
}
351+
}
352+
}
353+
None
354+
}
355+
262356
fn extract_ocr_values(text: &str) -> OcrExtraction {
263357
let cleaned = text.replace('\r', "\n");
264358
OcrExtraction {
@@ -280,6 +374,7 @@ fn extract_ocr_values(text: &str) -> OcrExtraction {
280374
prolactin: extract_ocr_value(&cleaned, &["prolactin"]),
281375
shbg: extract_ocr_value(&cleaned, &["sex hormone binding globulin", "shbg"]),
282376
fai: extract_ocr_value(&cleaned, &["free androgen index", "fai"]),
377+
sample_date: extract_sample_date(&cleaned),
283378
}
284379
}
285380

@@ -369,6 +464,7 @@ pub fn CreateBloodTest() -> impl IntoView {
369464
let shbg_level = shbg_level;
370465
let shbg_unit = shbg_unit;
371466
let free_androgen_index = free_androgen_index;
467+
let test_date_time = test_date_time;
372468
move |ev: leptos::ev::Event| {
373469
if ocr_busy.get() {
374470
return;
@@ -481,6 +577,7 @@ pub fn CreateBloodTest() -> impl IntoView {
481577
shbg_level,
482578
shbg_unit,
483579
free_androgen_index,
580+
test_date_time,
484581
);
485582
if filled == 0 {
486583
ocr_error.set(Some("OCR ran, but no lab values were found.".to_string()));
@@ -523,6 +620,7 @@ pub fn CreateBloodTest() -> impl IntoView {
523620
let shbg_level = shbg_level;
524621
let shbg_unit = shbg_unit;
525622
let free_androgen_index = free_androgen_index;
623+
let test_date_time = test_date_time;
526624
move |ev: leptos::ev::Event| {
527625
if pdf_busy.get() {
528626
return;
@@ -643,6 +741,7 @@ pub fn CreateBloodTest() -> impl IntoView {
643741
shbg_level,
644742
shbg_unit,
645743
free_androgen_index,
744+
test_date_time,
646745
);
647746
}
648747
if item.extract_error.is_some() {

0 commit comments

Comments
 (0)