|
| 1 | +import asyncio |
| 2 | +import json |
| 3 | +import argparse |
| 4 | +import matplotlib.pyplot as plt |
| 5 | +from healthcare_integration import HealthcareDataIntegrationMicroservice |
| 6 | + |
async def evaluate_against_ground_truth(dataset_path, ground_truth_path):
    """Evaluate the Healthcare Integration against human-annotated ground truth.

    Loads the dataset and the ground-truth annotations from JSON, runs the
    dataset through a freshly deployed ``HealthcareDataIntegrationMicroservice``
    and compares its output with the annotations.

    Args:
        dataset_path: Path to the JSON dataset; its parsed content is passed
            unchanged to ``integrate_patient_records`` and iterated as a
            sequence of records for the language check.
        ground_truth_path: Path to a JSON object providing the keys
            ``"conditions"``, ``"care_gaps"`` and ``"language_annotations"``.

    Returns:
        The dictionary of raw counts per category, or ``None`` when either
        input file could not be loaded (the error is printed).

    Side effects:
        Prints a metric summary and writes
        ``healthcare_evaluation_results.png`` to the current directory.
    """
    # Best-effort boundary: any load/parse failure is reported and the
    # evaluation is abandoned instead of propagating to the caller.
    try:
        # JSON interchange is UTF-8; do not depend on the locale default.
        with open(dataset_path, "r", encoding="utf-8") as f:
            dataset = json.load(f)

        with open(ground_truth_path, "r", encoding="utf-8") as f:
            ground_truth = json.load(f)
    except Exception as e:
        print(f"Error loading dataset or ground truth: {e}")
        return

    # Initialize integration service
    integrator = HealthcareDataIntegrationMicroservice(
        name="Evaluation Integrator",
        description="Healthcare integrator for evaluation against ground truth",
    )
    integrator.deploy()

    # Raw counts. NOTE: "medication_resolution" is part of the returned shape
    # but nothing in this function updates it, so it always stays at zero.
    results = {
        "condition_detection": {
            "true_positive": 0,
            "false_positive": 0,
            "false_negative": 0,
        },
        "care_gap_detection": {
            "true_positive": 0,
            "false_positive": 0,
            "false_negative": 0,
        },
        "medication_resolution": {
            "correct": 0,
            "incorrect": 0,
        },
        "language_detection": {
            "correct": 0,
            "incorrect": 0,
        },
    }

    # Process the dataset
    integration_result = await integrator.integrate_patient_records(dataset)
    health_data = integration_result["health_data"]

    # Condition detection: exact, case-insensitive name match.
    detected_conditions = {c["name"].lower() for c in health_data["conditions"]}
    true_conditions = {c.lower() for c in ground_truth["conditions"]}

    condition_counts = results["condition_detection"]
    condition_counts["true_positive"] = len(detected_conditions & true_conditions)
    condition_counts["false_positive"] = len(detected_conditions - true_conditions)
    condition_counts["false_negative"] = len(true_conditions - detected_conditions)

    # Care gap detection: an annotated gap matches when it occurs as a
    # case-insensitive substring of a detected gap description.
    detected_gaps = {g["description"].lower() for g in health_data["care_gaps"]}
    true_gaps = {g.lower() for g in ground_truth["care_gaps"]}

    gap_counts = results["care_gap_detection"]
    for detected_gap in detected_gaps:
        if any(true_gap in detected_gap for true_gap in true_gaps):
            gap_counts["true_positive"] += 1
        else:
            gap_counts["false_positive"] += 1

    for true_gap in true_gaps:
        if not any(true_gap in detected_gap for detected_gap in detected_gaps):
            gap_counts["false_negative"] += 1

    # Language detection: annotations are positional (i-th annotation belongs
    # to the i-th dataset record); processed records are matched by "id".
    language_counts = results["language_detection"]
    processed_records = integration_result.get("original_records", [])
    for index, record in enumerate(dataset):
        annotations = ground_truth["language_annotations"]
        if index >= len(annotations):
            break  # remaining records have no annotation to compare against

        record_id = record.get("id")
        processed = next(
            (p for p in processed_records if p.get("id") == record_id),
            None,
        )
        if processed is None:
            continue  # record missing from the output: not counted either way

        if processed.get("detected_language", "unknown") == annotations[index]:
            language_counts["correct"] += 1
        else:
            language_counts["incorrect"] += 1

    # Calculate metrics
    condition_precision, condition_recall, condition_f1 = _precision_recall_f1(
        condition_counts
    )
    gap_precision, gap_recall, gap_f1 = _precision_recall_f1(gap_counts)

    language_total = language_counts["correct"] + language_counts["incorrect"]
    language_accuracy = (
        language_counts["correct"] / language_total if language_total > 0 else 0
    )

    # Print summary
    print("\nEvaluation Results:")
    print("Condition Detection:")
    print(f"  Precision: {condition_precision:.2%}")
    print(f"  Recall: {condition_recall:.2%}")
    print(f"  F1 Score: {condition_f1:.2%}")

    print("\nCare Gap Detection:")
    print(f"  Precision: {gap_precision:.2%}")
    print(f"  Recall: {gap_recall:.2%}")
    print(f"  F1 Score: {gap_f1:.2%}")

    print("\nLanguage Detection:")
    print(f"  Accuracy: {language_accuracy:.2%}")

    # Plot results
    plot_path = "healthcare_evaluation_results.png"
    _save_metrics_plot(
        [condition_precision, condition_recall, condition_f1],
        [gap_precision, gap_recall, gap_f1],
        language_accuracy,
        plot_path,
    )
    print(f"Results plot saved to {plot_path}")

    return results


def _precision_recall_f1(counts):
    """Return ``(precision, recall, f1)`` for a TP/FP/FN tally; 0 on empty denominators."""
    true_positive = counts["true_positive"]
    predicted = true_positive + counts["false_positive"]
    actual = true_positive + counts["false_negative"]

    precision = true_positive / predicted if predicted > 0 else 0
    recall = true_positive / actual if actual > 0 else 0
    combined = precision + recall
    f1 = 2 * precision * recall / combined if combined > 0 else 0
    return precision, recall, f1


def _save_metrics_plot(condition_scores, gap_scores, language_accuracy, output_path):
    """Draw the three metric bar charts side by side and save them to *output_path*."""
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    try:
        panels = (
            ("Condition Detection", ["Precision", "Recall", "F1"], condition_scores),
            ("Care Gap Detection", ["Precision", "Recall", "F1"], gap_scores),
            ("Language Detection", ["Accuracy"], [language_accuracy]),
        )
        for axis, (title, labels, values) in zip(axes, panels):
            axis.bar(labels, values)
            axis.set_ylim(0, 1)  # every metric is a ratio in [0, 1]
            axis.set_title(title)

        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # pyplot keeps figures alive until closed; release it so repeated
        # evaluations do not accumulate open figures.
        plt.close(fig)
| 144 | + |
if __name__ == "__main__":
    # Command-line entry point: both input paths are mandatory options.
    cli = argparse.ArgumentParser(
        description="Evaluate Healthcare Integration against ground truth"
    )
    for flag, help_text in (
        ("--dataset", "Path to dataset JSON file"),
        ("--ground-truth", "Path to ground truth JSON file"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    options = cli.parse_args()

    # Build the coroutine first, then drive it to completion on a new loop.
    evaluation = evaluate_against_ground_truth(options.dataset, options.ground_truth)
    asyncio.run(evaluation)
0 commit comments