Merge pull request #6 from rvandewater/task-inspection

rvandewater · web-flow · commit 24e06fa86364 · 2025-02-04T16:33:06.000+01:00
Task inspection
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -2,3 +2,4 @@ include src/MEDS_Inspect/assets/MIMIC-IV-DEMO-MEDS/data/train/*
 include src/MEDS_Inspect/assets/MIMIC-IV-DEMO-MEDS/data/tuning/*
 include src/MEDS_Inspect/assets/MIMIC-IV-DEMO-MEDS/data/held_out/*
 include src/MEDS_Inspect/assets/MIMIC-IV-DEMO-MEDS/metadata/*
+include src/MEDS_Inspect/assets/MIMIC-IV-DEMO-MEDS/tasks/*
diff --git a/src/MEDS_Inspect/app.py b/src/MEDS_Inspect/app.py
@@ -1,5 +1,7 @@
 import importlib.resources as pkg_resources
+import os
 
+import pandas as pd
 import plotly.express as px
 import polars as pl
 from dash import Dash, Input, Output, State, dash_table, dcc, html
@@ -148,6 +150,7 @@ def render_content(tab, file_path):
         # codes = top_codes['code'].unique().to_list()
 
         numerical_codes = numerical_code_data.select("code").unique().collect()["code"].to_list()
+
         if tab == "tab-1":
             fig_code_count_years = px.histogram(
                 code_count_years, x="Date", y="Amount of codes", nbins=len(code_count_years)
@@ -278,13 +281,22 @@ def render_content(tab, file_path):
                     html.H2(children="Codes over time for a single patient", style={"textAlign": "center"}),
                     dcc.Dropdown(
                         id="patient-dropdown",
-                        options=[{"label": pid, "value": pid} for pid in subject_ids],
+                        options=[{"label": pid, "value": pid} for pid in subject_ids[:1000]],
                         placeholder="Select a patient ID",
+                        value=None,
+                        multi=False,
+                        searchable=True,
+                        clearable=True,
+                        style={"width": "100%"},
+                    ),
+                    dcc.Dropdown(
+                        id="task-dropdown",
+                        placeholder="Select a task",
                     ),
                     dcc.Loading(
                         id="loading-fig-patient-codes",
                         type="default",
-                        children=dcc.Graph(id="fig_patient_codes", style={"width": "90hh", "height": "50vh"}),
+                        children=dcc.Graph(id="fig_patient_codes", style={"width": "90hh", "height": "90vh"}),
                     ),
                 ],
                 style=card_style,
@@ -530,25 +542,112 @@ def update_top_codes(top_n, scale):
         )
         return fig_top_codes
 
-    @app.callback(Output("fig_patient_codes", "figure"), Input("patient-dropdown", "value"))
-    def update_patient_codes(patient_id):
+    @app.callback(
+        Output("fig_patient_codes", "figure"),
+        Output("task-dropdown", "options"),
+        Input("patient-dropdown", "value"),
+        Input("hidden-file-path", "value"),
+        Input("task-dropdown", "value"),
+    )
+    def update_patient_codes_and_task_dropdown(patient_id, file_path, selected_task):
+        """Redraw the single-patient code timeline and refresh the task dropdown options.
+
+        Args:
+            patient_id: Value of ``patient-dropdown``; ``None`` when nothing is selected.
+            file_path: Value of ``hidden-file-path``; passed to ``return_data_path`` to locate
+                the event data and searched for a ``tasks`` sub-directory.
+            selected_task: Value of ``task-dropdown``: a file name inside ``<file_path>/tasks``,
+                or a falsy value when no task is selected.
+
+        Returns:
+            A ``(figure, task_options)`` tuple. ``figure`` is ``{}`` when there is nothing to
+            plot; ``task_options`` has one ``{"label", "value"}`` entry per file in the
+            ``tasks`` directory, labelled with the file name minus its extension.
+        """
+        if not file_path:
+            # No dataset path yet, so there is nothing to scan or plot.
+            return {}, []
+
+        # Guard against a missing ``tasks`` directory: os.listdir raises on a path that is absent.
+        tasks_path = os.path.join(file_path, "tasks")
+        detected_tasks = []
+        if os.path.isdir(tasks_path):
+            detected_tasks = sorted(
+                f for f in os.listdir(tasks_path) if os.path.isfile(os.path.join(tasks_path, f))
+            )
+        task_options = [{"label": os.path.splitext(task)[0], "value": task} for task in detected_tasks]
+
         if patient_id is None:
-            return {}
+            return {}, task_options
 
         patient_data = (
             pl.scan_parquet(return_data_path(file_path))
             .filter(pl.col("subject_id") == patient_id)
-            .select(pl.col("time"), pl.col("code"))
+            .select(pl.col("time"), pl.col("code"), pl.col("numeric_value"), pl.col("text_value"))
+            .with_columns(pl.col("code").str.split("/").list.first().alias("coding_dict"))
             .collect()
         )
 
         if patient_data.is_empty():
-            return {}
+            return {}, task_options
 
+        # Create the scatter plot with color based on the category
         fig_patient_codes = px.scatter(
-            patient_data, x="time", y="code", title=f"Codes over time for patient {patient_id}"
+            patient_data,
+            x="time",
+            y="code",
+            color="coding_dict",
+            title=f"Codes over time for patient {patient_id}",
+            labels={"coding_dict": "Code Category"},
+            hover_data={"code": True, "numeric_value": True, "text_value": True},
         )
-        return fig_patient_codes
+
+        if not selected_task:
+            return fig_patient_codes, task_options
+
+        task_file_path = os.path.join(file_path, "tasks", selected_task)
+        if not os.path.isfile(task_file_path):
+            return fig_patient_codes, task_options
+
+        task_label = pl.scan_parquet(task_file_path).filter(pl.col("subject_id") == patient_id).collect()
+        if task_label.is_empty():
+            return fig_patient_codes, task_options
+
+        task_name = os.path.splitext(selected_task)[0]
+        # (row key, hover label) pairs for the label columns a task row may carry.
+        label_fields = (
+            ("boolean_value", "Boolean Value"),
+            ("integer_value", "Integer Value"),
+            ("float_value", "Float Value"),
+            ("categorical_value", "Categorical Value"),
+        )
+        for row in task_label.iter_rows(named=True):
+            # Workaround for plotly that does not allow datetime values
+            # NOTE(review): timestamp() reads a naive datetime as local time, so on a non-UTC
+            # host the marker may be offset from the plotted events — confirm.
+            prediction_time_timestamp = row["prediction_time"].timestamp() * 1000
+            color = "red" if row.get("boolean_value", False) else "green"
+
+            hover_text = f"Task: {task_name}<br>Prediction Time: {row['prediction_time']}"
+            for key, label in label_fields:
+                if row.get(key) is not None:
+                    hover_text += f"<br>{label}: {row[key]}"
+
+            fig_patient_codes.add_scatter(
+                x=[prediction_time_timestamp, prediction_time_timestamp],
+                y=[0, 1],
+                mode="lines",
+                line=dict(color=color, dash="dash"),
+                customdata=pd.Series(data=row),
+                hovertemplate=hover_text,
+                name=f"{task_name} {row['prediction_time']}",
+                yaxis="y2",
+            )
+
+        # The task markers are drawn against yaxis2 (y from 0 to 1); hide its tick labels.
+        fig_patient_codes.update_layout(yaxis2=dict(showticklabels=False))
+
+        return fig_patient_codes, task_options
 
     @app.callback(
         Output("fig_code_distribution", "figure"),
diff --git a/src/MEDS_Inspect/assets/MIMIC-IV-DEMO-MEDS/tasks/ICU Mortality first 24h.parquet b/src/MEDS_Inspect/assets/MIMIC-IV-DEMO-MEDS/tasks/ICU Mortality first 24h.parquet
diff --git a/src/MEDS_Inspect/utils.py b/src/MEDS_Inspect/utils.py
@@ -42,3 +42,22 @@ def return_data_path(file_path):
     else:
         logging.error("No data found in the specified paths.")
         return None
+
+
+def get_detected_tasks(file_path):
+    """Return the names of the task files stored under ``<file_path>/tasks``.
+
+    Args:
+        file_path: Directory expected to contain a ``tasks`` sub-directory.
+
+    Returns:
+        Sorted file names (sub-directories are skipped); an empty list when
+        ``file_path`` is falsy or has no ``tasks`` directory.
+    """
+    if not file_path:
+        return []
+    tasks_path = os.path.join(file_path, "tasks")
+    # os.listdir raises on a missing path, so treat "no tasks directory" as "no tasks".
+    if not os.path.isdir(tasks_path):
+        return []
+    return sorted(f for f in os.listdir(tasks_path) if os.path.isfile(os.path.join(tasks_path, f)))
diff --git a/task_extraction.txt b/task_extraction.txt
@@ -0,0 +1 @@
+/Users/robin/Documents/git/MEDS-DEV/src/MEDS_DEV/tasks/mortality/in_icu/first_24h.yaml

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+/Users/robin/Documents/git/MEDS-DEV/src/MEDS_DEV/tasks/mortality/in_icu/first_24h.yaml`