-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathehr_search.rs
More file actions
245 lines (213 loc) · 8.76 KB
/
ehr_search.rs
File metadata and controls
245 lines (213 loc) · 8.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
use zeroentropy_community::{Client, MetadataValue};
use std::collections::HashMap;
use std::time::Duration;
use tokio::time::sleep;
/// Example demonstrating semantic search over Electronic Health Records (EHR).
///
/// This example uses the Medical Transcriptions dataset from Kaggle
/// (<https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions>)
/// containing ~5000 de-identified medical transcriptions across various specialties.
///
/// # Usage
///
/// 1. Download the dataset from Kaggle and extract `mtsamples.csv` to your temp directory
///    (Windows: `%TEMP%`, Linux/Mac: `/tmp`). The file is looked up in the directory
///    returned by `std::env::temp_dir()`.
/// 2. Set your API key and run:
///
///    ```text
///    Windows:   set ZEROENTROPY_API_KEY=your-api-key
///    Linux/Mac: export ZEROENTROPY_API_KEY="your-api-key"
///    cargo run --example ehr_search
///    ```
///
/// This demonstrates how ZeroEntropy enables semantic search across medical records,
/// finding relevant information by meaning rather than exact keyword matches.
///
/// # Errors
///
/// Returns an error if the client cannot be built from the environment, if the CSV file
/// cannot be read or parsed, or if a collection/listing/query request fails. A failure to
/// index an individual document is only reported as a warning on stderr.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = Client::from_env()?;
    let collection = "medical_transcriptions";

    println!("\n=== ZeroEntropy EHR Search Demo ===\n");

    // Create the collection; a conflict is treated as "already exists" so the demo
    // can be re-run.
    println!("Creating collection '{}'...", collection);
    match client.collections().add(collection).await {
        Ok(_) => println!("✓ Collection created"),
        Err(zeroentropy_community::Error::Conflict(_)) => {
            println!("✓ Collection already exists")
        }
        Err(e) => return Err(e.into()),
    }

    // Only index when the collection reports no documents.
    // NOTE(review): `Some(1)` is presumably a page-size limit — confirm against the client API.
    let doc_list = client
        .documents()
        .get_info_list(collection, Some(1), None)
        .await?;

    if doc_list.documents.is_empty() {
        println!("\n=== Indexing Medical Transcriptions ===");

        let temp_dir = std::env::temp_dir();
        let csv_path = temp_dir.join("mtsamples.csv");
        println!("Reading CSV from {}...", csv_path.display());

        // Read and parse CSV
        let csv_content = std::fs::read_to_string(&csv_path)?;
        let mut rdr = csv::Reader::from_reader(csv_content.as_bytes());

        // Resolve columns by header name so that a file with extra or reordered columns
        // is still read correctly; fall back to the documented layout
        // (description, medical_specialty, sample_name, transcription, keywords).
        // NOTE(review): some copies of the dataset appear to carry a leading unnamed
        // index column — confirm against your download.
        let headers = rdr.headers()?.clone();
        let column = |name: &str, fallback: usize| {
            headers
                .iter()
                .position(|header| header.trim() == name)
                .unwrap_or(fallback)
        };
        let description_col = column("description", 0);
        let specialty_col = column("medical_specialty", 1);
        let transcription_col = column("transcription", 3);
        let keywords_col = column("keywords", 4);

        let mut count = 0;
        let max_docs = 100; // Index first 100 for demo (adjust as needed)
        println!("Indexing first {} transcriptions...", max_docs);

        for (idx, result) in rdr.records().enumerate() {
            if count >= max_docs {
                break;
            }
            let record = result?;

            let description = record.get(description_col).unwrap_or("");
            let specialty = record.get(specialty_col).unwrap_or("");
            let transcription = record.get(transcription_col).unwrap_or("");
            let keywords = record.get(keywords_col).unwrap_or("");

            // Rows without a transcription have nothing to index.
            if transcription.is_empty() {
                continue;
            }

            // Create metadata
            let mut metadata = HashMap::new();
            metadata.insert(
                "specialty".to_string(),
                MetadataValue::String(specialty.to_string()),
            );
            metadata.insert(
                "description".to_string(),
                MetadataValue::String(description.to_string()),
            );
            if !keywords.is_empty() {
                metadata.insert(
                    "keywords".to_string(),
                    MetadataValue::String(keywords.to_string()),
                );
            }

            // Add document; the id is derived from the record's position in the CSV,
            // so skipped rows leave gaps in the numbering.
            let doc_id = format!("record_{:04}", idx);
            match client
                .documents()
                .add_text(collection, &doc_id, transcription, Some(metadata))
                .await
            {
                Ok(_) => {
                    count += 1;
                    // Progress dot every 10 documents; flush because `print!` does not.
                    if count % 10 == 0 {
                        print!(".");
                        std::io::Write::flush(&mut std::io::stdout())?;
                    }
                }
                Err(zeroentropy_community::Error::Conflict(_)) => {
                    count += 1; // Already exists
                }
                // Best effort: one bad document should not abort the whole demo.
                Err(e) => eprintln!("\nWarning: Failed to index {}: {}", doc_id, e),
            }
        }

        println!("\n✓ Indexed {} medical transcriptions", count);
        println!("Waiting for indexing to complete...");
        sleep(Duration::from_secs(5)).await;
    } else {
        println!("✓ Collection already contains documents");
    }

    // Example clinical queries demonstrating semantic search capabilities:
    // (query text, description of the use case).
    let queries = [
        (
            "Patient with chest pain and shortness of breath",
            "Finding cardiovascular symptoms across different documentation styles",
        ),
        (
            "History of diabetes and kidney problems",
            "Finding related chronic conditions even with varied terminology",
        ),
        (
            "Postoperative complications and wound care",
            "Surgical follow-up documentation",
        ),
        (
            "Mental health assessment and depression screening",
            "Psychiatric and behavioral health notes",
        ),
        (
            "Imaging findings showing mass or lesion",
            "Radiology and pathology reports",
        ),
    ];

    println!("\n=== Clinical Query Examples ===\n");
    for (query, description) in queries {
        println!("Query: \"{}\"", query);
        println!("Use case: {}", description);

        let results = client
            .queries()
            .top_snippets(
                collection,
                query,
                3,          // top 3 results
                None,       // no filter
                Some(true), // include metadata
                Some(true), // precise responses for longer context
                None,       // default reranker
            )
            .await?;

        if results.results.is_empty() {
            println!(" No results found\n");
            continue;
        }

        for (i, result) in results.results.iter().enumerate() {
            // Fall back to "Unknown" when metadata is missing or not a string.
            let specialty = result
                .metadata
                .as_ref()
                .and_then(|m| m.get("specialty"))
                .and_then(|v| match v {
                    MetadataValue::String(s) => Some(s.as_str()),
                    _ => None,
                })
                .unwrap_or("Unknown");

            println!("\n {}. [{}] Score: {:.4}", i + 1, specialty, result.score);
            // Show snippet (truncated if too long)
            println!(" {}", snippet_preview(&result.content, 200));
        }
        println!("\n{}", "─".repeat(80));
    }

    println!("\n=== Specialty Filtering Example ===\n");
    // Demonstrate metadata filtering
    println!("Query: 'patient assessment' filtered to Cardiology specialty");

    // Build filter: { "specialty": { "$eq": "Cardiovascular / Pulmonary" } }
    let mut filter = HashMap::new();
    filter.insert(
        "specialty".to_string(),
        serde_json::json!({ "$eq": "Cardiovascular / Pulmonary" }),
    );

    let results = client
        .queries()
        .top_snippets(
            collection,
            "patient assessment",
            3,
            Some(filter),
            Some(true),
            None,
            None,
        )
        .await?;

    println!("Found {} cardiology records:", results.results.len());
    for (i, result) in results.results.iter().enumerate() {
        println!(" {}. Score: {:.4}", i + 1, result.score);
        // The ellipsis is only added when the snippet was actually shortened.
        println!(" {}", snippet_preview(&result.content, 150));
    }

    println!("\n=== Demo Complete ===");
    println!("\nKey capabilities demonstrated:");
    println!(" ✓ Semantic search - finds by meaning, not just keywords");
    println!(" ✓ Cross-specialty search - works across all medical domains");
    println!(" ✓ Metadata filtering - narrow searches by specialty, date, etc.");
    println!(" ✓ Ranked results - most relevant findings first");
    println!("\nPotential real-world applications:");
    println!(" • Clinical decision support - find similar cases");
    println!(" • Quality assurance - audit documentation patterns");
    println!(" • Research cohort building - identify eligible patients");
    println!(" • Prior authorization - find supporting documentation");
    println!(" • Medical-legal review - locate relevant encounters");

    Ok(())
}

/// Builds a single-line preview of `text` that is at most `max_bytes` bytes long
/// (plus a trailing `...` when the text had to be shortened).
///
/// The cut position is moved back to the nearest UTF-8 character boundary, so the
/// function never panics on multi-byte characters. Newlines are replaced by spaces.
fn snippet_preview(text: &str, max_bytes: usize) -> String {
    let preview = if text.len() > max_bytes {
        let mut end = max_bytes;
        // Index 0 is always a boundary, so this loop terminates.
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        format!("{}...", &text[..end])
    } else {
        text.to_string()
    };
    preview.replace('\n', " ")
}