Commit 50507a9

Create evaluate.py
1 parent 6e8f91f commit 50507a9

1 file changed: evaluate.py (+151, −0)
import argparse
import asyncio
import json

import matplotlib.pyplot as plt

from healthcare_integration import HealthcareDataIntegrationMicroservice

async def evaluate_against_ground_truth(dataset_path, ground_truth_path):
    """Evaluate the healthcare integration service against human-annotated ground truth."""
    # Load the dataset and the ground-truth annotations
    try:
        with open(dataset_path, "r") as f:
            dataset = json.load(f)

        with open(ground_truth_path, "r") as f:
            ground_truth = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        print(f"Error loading dataset or ground truth: {e}")
        return None
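    # The input file formats are not documented in this commit. From the
    # lookups below, a plausible sketch (illustrative, not authoritative):
    #   dataset: a list of patient records, each with an "id" field
    #   ground_truth: {
    #       "conditions": [...],            # condition names
    #       "care_gaps": [...],             # care-gap descriptions
    #       "language_annotations": [...],  # one language code per record
    #   }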
    # Initialize the integration service
    integrator = HealthcareDataIntegrationMicroservice(
        name="Evaluation Integrator",
        description="Healthcare integrator for evaluation against ground truth"
    )
    integrator.deploy()

    # Track raw counts for each evaluation task.
    # Note: medication_resolution is initialized but not scored in this version.
    results = {
        "condition_detection": {
            "true_positive": 0,
            "false_positive": 0,
            "false_negative": 0
        },
        "care_gap_detection": {
            "true_positive": 0,
            "false_positive": 0,
            "false_negative": 0
        },
        "medication_resolution": {
            "correct": 0,
            "incorrect": 0
        },
        "language_detection": {
            "correct": 0,
            "incorrect": 0
        }
    }

    # Run the full dataset through the integration pipeline
    integration_result = await integrator.integrate_patient_records(dataset)
    # Evaluate condition detection (exact name match, case-insensitive)
    detected_conditions = {c["name"].lower() for c in integration_result["health_data"]["conditions"]}
    true_conditions = {c.lower() for c in ground_truth["conditions"]}

    for condition in detected_conditions:
        if condition in true_conditions:
            results["condition_detection"]["true_positive"] += 1
        else:
            results["condition_detection"]["false_positive"] += 1

    for condition in true_conditions:
        if condition not in detected_conditions:
            results["condition_detection"]["false_negative"] += 1
    # Evaluate care gap detection. Unlike conditions, gaps are matched by
    # case-insensitive substring containment, since detected descriptions
    # may be more verbose than the annotated ones.
    detected_gaps = {g["description"].lower() for g in integration_result["health_data"]["care_gaps"]}
    true_gaps = {g.lower() for g in ground_truth["care_gaps"]}

    for gap in detected_gaps:
        if any(true_gap in gap for true_gap in true_gaps):
            results["care_gap_detection"]["true_positive"] += 1
        else:
            results["care_gap_detection"]["false_positive"] += 1

    for gap in true_gaps:
        if not any(gap in detected_gap for detected_gap in detected_gaps):
            results["care_gap_detection"]["false_negative"] += 1
    # Evaluate language detection, one annotation per dataset record
    for i, record in enumerate(dataset):
        if i < len(ground_truth["language_annotations"]):
            true_lang = ground_truth["language_annotations"][i]

            # Find the corresponding processed record by id
            for processed in integration_result.get("original_records", []):
                if processed.get("id") == record.get("id"):
                    detected_lang = processed.get("detected_language", "unknown")
                    if detected_lang == true_lang:
                        results["language_detection"]["correct"] += 1
                    else:
                        results["language_detection"]["incorrect"] += 1
                    break
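    # NOTE: records with no matching entry in "original_records" are skipped,
    # so language accuracy reflects only records the service echoed back.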
    # Calculate precision, recall, and F1 from the raw counts, guarding
    # against division by zero when a denominator is empty
    def precision_recall_f1(counts):
        tp, fp, fn = counts["true_positive"], counts["false_positive"], counts["false_negative"]
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return precision, recall, f1

    condition_precision, condition_recall, condition_f1 = precision_recall_f1(results["condition_detection"])
    gap_precision, gap_recall, gap_f1 = precision_recall_f1(results["care_gap_detection"])

    lang = results["language_detection"]
    lang_total = lang["correct"] + lang["incorrect"]
    language_accuracy = lang["correct"] / lang_total if lang_total > 0 else 0
    # Print summary
    print("\nEvaluation Results:")
    print("Condition Detection:")
    print(f"  Precision: {condition_precision:.2%}")
    print(f"  Recall: {condition_recall:.2%}")
    print(f"  F1 Score: {condition_f1:.2%}")

    print("\nCare Gap Detection:")
    print(f"  Precision: {gap_precision:.2%}")
    print(f"  Recall: {gap_recall:.2%}")
    print(f"  F1 Score: {gap_f1:.2%}")

    print("\nLanguage Detection:")
    print(f"  Accuracy: {language_accuracy:.2%}")
    # Plot results as three bar charts
    fig, ax = plt.subplots(1, 3, figsize=(15, 5))

    # Condition detection metrics
    ax[0].bar(["Precision", "Recall", "F1"], [condition_precision, condition_recall, condition_f1])
    ax[0].set_ylim(0, 1)
    ax[0].set_title("Condition Detection")

    # Care gap detection metrics
    ax[1].bar(["Precision", "Recall", "F1"], [gap_precision, gap_recall, gap_f1])
    ax[1].set_ylim(0, 1)
    ax[1].set_title("Care Gap Detection")

    # Language detection accuracy
    ax[2].bar(["Accuracy"], [language_accuracy])
    ax[2].set_ylim(0, 1)
    ax[2].set_title("Language Detection")

    plt.tight_layout()
    plt.savefig("healthcare_evaluation_results.png")
    print("Results plot saved to healthcare_evaluation_results.png")

    return results
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate Healthcare Integration against ground truth")
    parser.add_argument("--dataset", type=str, required=True, help="Path to dataset JSON file")
    parser.add_argument("--ground-truth", type=str, required=True, help="Path to ground truth JSON file")
    args = parser.parse_args()

    asyncio.run(evaluate_against_ground_truth(args.dataset, args.ground_truth))
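
Example invocation (the file names here are illustrative; both flags are required):

    python evaluate.py --dataset patients.json --ground-truth annotations.json

This assumes the healthcare_integration module is importable and matplotlib is
installed; the script prints the metrics and writes the bar charts to
healthcare_evaluation_results.png.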
