Update the default task cache path to include task parameter names and values (#766)

EricSchrock · web-flow · commit 82011b0ae700 · 2026-01-02T14:57:21.000-05:00
* Document ReadmissionPredictionMIMIC3 as a class instead of a function

* Call `close` on sample datasets

* Remove the TemporaryDirectory.cleanup() calls as cleanup will be called automatically when the current context is exited

* Update the default task cache path to include task parameter names and values

* Use UUID v5 in task cache names

* Update task cache_dir docs for UUID v5 based default
diff --git a/docs/api/tasks/pyhealth.tasks.readmission_prediction.rst b/docs/api/tasks/pyhealth.tasks.readmission_prediction.rst
@@ -1,7 +1,11 @@
 ﻿pyhealth.tasks.readmission_prediction
 =======================================
 
-.. autofunction:: pyhealth.tasks.readmission_prediction.ReadmissionPredictionMIMIC3
+.. autoclass:: pyhealth.tasks.readmission_prediction.ReadmissionPredictionMIMIC3
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 .. autofunction:: pyhealth.tasks.readmission_prediction.readmission_prediction_mimic4_fn
 .. autofunction:: pyhealth.tasks.readmission_prediction.readmission_prediction_eicu_fn
 .. autofunction:: pyhealth.tasks.readmission_prediction.readmission_prediction_eicu_fn2
diff --git a/pyhealth/datasets/base_dataset.py b/pyhealth/datasets/base_dataset.py
@@ -118,7 +118,7 @@ def _csv_tsv_gz_path(path: str) -> str:
 class _ProgressContext:
     def __init__(self, queue: multiprocessing.queues.Queue | None, total: int, **kwargs):
         """
-        :param queue: An existing queue (e.g., from multiprocessing). If provided, 
+        :param queue: An existing queue (e.g., from multiprocessing). If provided,
                       this class acts as a passthrough.
         :param total: Total items for the progress bar (only used if queue is None).
         :param kwargs: Extra arguments for tqdm (e.g., desc="Processing").
@@ -135,7 +135,7 @@ def put(self, n):
     def __enter__(self):
         if self.queue:
             return self.queue
-        
+
         self.progress = tqdm(total=self.total, **self.kwargs)
         return self
 
@@ -158,7 +158,7 @@ def _task_transform_init(queue: multiprocessing.queues.Queue) -> None:
 def _task_transform_fn(args: tuple[int, BaseTask, Iterable[str], pl.LazyFrame, Path]) -> None:
     """
     Worker function to apply task transformation on a chunk of patients.
-    
+
     Args:
         args (tuple): A tuple containing:
             worker_id (int): The ID of the worker.
@@ -171,13 +171,13 @@ def _task_transform_fn(args: tuple[int, BaseTask, Iterable[str], pl.LazyFrame, P
     worker_id, task, patient_ids, global_event_df, output_dir = args
     total_patients = len(list(patient_ids))
     logger.info(f"Worker {worker_id} started processing {total_patients} patients. (Polars threads: {pl.thread_pool_size()})")
-    
+
     with (
-        set_env(DATA_OPTIMIZER_GLOBAL_RANK=str(worker_id)), 
+        set_env(DATA_OPTIMIZER_GLOBAL_RANK=str(worker_id)),
         _ProgressContext(_task_transform_progress, total=total_patients) as progress
     ):
         writer = BinaryWriter(cache_dir=str(output_dir), chunk_bytes="64MB")
-            
+
         write_index = 0
         batches = itertools.batched(patient_ids, BATCH_SIZE)
         for batch in batches:
@@ -210,11 +210,11 @@ def _proc_transform_init(queue: multiprocessing.queues.Queue) -> None:
     """
     global _proc_transform_progress
     _proc_transform_progress = queue
-    
+
 def _proc_transform_fn(args: tuple[int, Path, int, int, Path]) -> None:
     """
     Worker function to apply processors on a chunk of samples.
-    
+
     Args:
         args (tuple): A tuple containing:
             worker_id (int): The ID of the worker.
@@ -233,15 +233,15 @@ def _proc_transform_fn(args: tuple[int, Path, int, int, Path]) -> None:
         _ProgressContext(_proc_transform_progress, total=total_samples) as progress
     ):
         writer = BinaryWriter(cache_dir=str(output_dir), chunk_bytes="64MB")
-    
+
         dataset = litdata.StreamingDataset(str(task_df))
         complete = 0
         with open(f"{output_dir}/schema.pkl", "rb") as f:
             metadata = pickle.load(f)
 
             input_processors = metadata["input_processors"]
             output_processors = metadata["output_processors"]
-            
+
             write_index = 0
             for i in range(start_idx, end_idx):
                 transformed: Dict[str, Any] = {}
@@ -255,7 +255,7 @@ def _proc_transform_fn(args: tuple[int, Path, int, int, Path]) -> None:
                 writer.add_item(write_index, transformed)
                 write_index += 1
                 complete += 1
-            
+
                 if complete >= BATCH_SIZE:
                     progress.put(complete)
                     complete = 0
@@ -680,7 +680,7 @@ def default_task(self) -> Optional[BaseTask]:
 
     def _task_transform(self, task: BaseTask, output_dir: Path, num_workers: int) -> None:
         self._main_guard(self._task_transform.__name__)
-        
+
         logger.info(f"Applying task transformations on data with {num_workers} workers...")
         global_event_df = task.pre_filter(self.global_event_df)
         patient_ids = (
@@ -691,16 +691,16 @@ def _task_transform(self, task: BaseTask, output_dir: Path, num_workers: int) ->
             # .sort can reduce runtime by 5%.
             .sort()
         )
-        
+
         if in_notebook():
             logger.info("Detected Jupyter notebook environment, setting num_workers to 1")
             num_workers = 1
         num_workers = min(num_workers, len(patient_ids)) # Avoid spawning empty workers
-        
+
         # This ensures worker's polars threads are limited to avoid oversubscription,
         # which can lead to additional 75% speedup when num_workers is large.
         threads_per_worker = max(1, (os.cpu_count() or 1) // num_workers)
-        
+
         try:
             with set_env(POLARS_MAX_THREADS=str(threads_per_worker), DATA_OPTIMIZER_NUM_WORKERS=str(num_workers)):
                 if num_workers == 1:
@@ -727,7 +727,7 @@ def _task_transform(self, task: BaseTask, output_dir: Path, num_workers: int) ->
                                 progress.update(queue.get(timeout=1))
                             except:
                                 pass
-                                
+
                         # remaining items
                         while not queue.empty():
                             progress.update(queue.get())
@@ -739,17 +739,17 @@ def _task_transform(self, task: BaseTask, output_dir: Path, num_workers: int) ->
             logger.error(f"Error during task transformation, cleaning up output directory: {output_dir}")
             shutil.rmtree(output_dir)
             raise e
-                
+
     def _proc_transform(self, task_df: Path, output_dir: Path, num_workers: int) -> None:
         self._main_guard(self._proc_transform.__name__)
-        
+
         logger.info(f"Applying processors on data with {num_workers} workers...")
         num_samples = len(litdata.StreamingDataset(str(task_df)))
-            
+
         if in_notebook():
             logger.info("Detected Jupyter notebook environment, setting num_workers to 1")
             num_workers = 1
-        
+
         num_workers = min(num_workers, num_samples) # Avoid spawning empty workers
         try:
             with set_env(DATA_OPTIMIZER_NUM_WORKERS=str(num_workers)):
@@ -758,7 +758,7 @@ def _proc_transform(self, task_df: Path, output_dir: Path, num_workers: int) ->
                     _proc_transform_fn((0, task_df, 0, num_samples, output_dir))
                     BinaryWriter(cache_dir=str(output_dir), chunk_bytes="64MB").merge(num_workers)
                     return
-                
+
                 ctx = multiprocessing.get_context("spawn")
                 queue = ctx.Queue()
                 linspace = more_itertools.sliding_window(np.linspace(0, num_samples, num_workers + 1, dtype=int), 2)
@@ -777,7 +777,7 @@ def _proc_transform(self, task_df: Path, output_dir: Path, num_workers: int) ->
                                 progress.update(queue.get(timeout=1))
                             except:
                                 pass
-                                
+
                         # remaining items
                         while not queue.empty():
                             progress.update(queue.get())
@@ -814,8 +814,8 @@ def set_task(
         Args:
             task (Optional[BaseTask]): The task to set. Uses default task if None.
             num_workers (int): Number of workers for multi-threading. Default is `self.num_workers`.
-            cache_dir (Optional[str]): Directory to cache samples after task transformation, 
-                but without applying processors. Default is {self.cache_dir}/tasks/{task_name}.
+            cache_dir (Optional[str]): Directory to cache samples after task transformation,
+                but without applying processors. Default is {self.cache_dir}/tasks/{task_name}_{uuid5(vars(task))}.
             cache_format (str): Deprecated. Only "parquet" is supported now.
             input_processors (Optional[Dict[str, FeatureProcessor]]):
                 Pre-fitted input processors. If provided, these will be used
@@ -835,7 +835,7 @@ def set_task(
         if task is None:
             assert self.default_task is not None, "No default tasks found"
             task = self.default_task
-            
+
         if num_workers is None:
             num_workers = self.num_workers
 
@@ -846,8 +846,14 @@ def set_task(
             f"Setting task {task.task_name} for {self.dataset_name} base dataset..."
         )
 
+        task_params = json.dumps(
+            vars(task),
+            sort_keys=True,
+            default=str
+        )
+
         if cache_dir is None:
-            cache_dir = self.cache_dir / "tasks" / task.task_name
+            cache_dir = self.cache_dir / "tasks" / f"{task.task_name}_{uuid.uuid5(uuid.NAMESPACE_DNS, task_params)}"
             cache_dir.mkdir(parents=True, exist_ok=True)
         else:
             # Ensure the explicitly provided cache_dir exists
@@ -856,7 +862,7 @@ def set_task(
 
         task_df_path = Path(cache_dir) / "task_df.ld"
         samples_path = Path(cache_dir) / f"samples_{uuid.uuid4()}.ld"
-        
+
         task_df_path.mkdir(parents=True, exist_ok=True)
         samples_path.mkdir(parents=True, exist_ok=True)
 
diff --git a/pyhealth/datasets/chestxray14.py b/pyhealth/datasets/chestxray14.py
@@ -53,7 +53,8 @@ def __init__(self,
                  root: str = ".",
                  config_path: Optional[str] = str(Path(__file__).parent / "configs" / "chestxray14.yaml"),
                  download: bool = False,
-                 partial: bool = False) -> None:
+                 partial: bool = False,
+                 **kwargs) -> None:
         """Initializes the ChestX-ray14 dataset.
 
         Args:
@@ -87,6 +88,7 @@ def __init__(self,
             tables=["chestxray14"],
             dataset_name="ChestX-ray14",
             config_path=config_path,
+            **kwargs
         )
 
     @property
diff --git a/tests/core/test_caching.py b/tests/core/test_caching.py
@@ -3,9 +3,10 @@
 import shutil
 from pathlib import Path
 from unittest.mock import patch
-import polars as pl
 import dask.dataframe as dd
 import torch
+import json
+import uuid
 
 from tests.base import BaseTestCase
 from pyhealth.datasets.base_dataset import BaseDataset
@@ -15,12 +16,14 @@
 
 class MockTask(BaseTask):
     """Mock task for testing purposes."""
+    task_name = "test_task"
+    input_schema = {"test_attribute": "raw"}
+    output_schema = {"test_label": "binary"}
 
-    def __init__(self, task_name="test_task"):
-        self.task_name = task_name
-        self.input_schema = {"test_attribute": "raw"}
-        self.output_schema = {"test_label": "binary"}
+    def __init__(self, param=None):
         self.call_count = 0
+        if param:
+            self.param = param
 
     def __call__(self, patient):
         """Return mock samples based on patient data."""
@@ -77,20 +80,18 @@ def load_data(self) -> dd.DataFrame:
 class TestCachingFunctionality(BaseTestCase):
     """Test cases for caching functionality in BaseDataset.set_task()."""
 
+    @classmethod
+    def setUpClass(cls):
+        cls.temp_dir = tempfile.TemporaryDirectory()
+        cls.dataset = MockDataset(cache_dir=cls.temp_dir.name)
+
     def setUp(self):
-        """Set up test fixtures."""
-        self.temp_dir = Path(tempfile.mkdtemp())
-        self.dataset = MockDataset(cache_dir=self.temp_dir)
         self.task = MockTask()
+        self.cache_dir = Path(self.temp_dir.name) / "task_cache"
+        self.cache_dir.mkdir()
 
     def tearDown(self):
-        """Clean up test fixtures."""
-        shutil.rmtree(self.temp_dir, ignore_errors=True)
-
-    def _task_cache_dir(self) -> Path:
-        cache_dir = self.temp_dir / "task_cache"
-        cache_dir.mkdir(parents=True, exist_ok=True)
-        return cache_dir
+        shutil.rmtree(self.cache_dir)
 
     def test_set_task_signature(self):
         """Test that set_task has the correct method signature."""
@@ -120,9 +121,8 @@ def test_set_task_signature(self):
 
     def test_set_task_writes_cache_and_metadata(self):
         """Ensure set_task materializes cache files and schema metadata."""
-        cache_dir = self._task_cache_dir()
         with self.dataset.set_task(
-            self.task, cache_dir=cache_dir, cache_format="parquet"
+            self.task, cache_dir=self.cache_dir, cache_format="parquet"
         ) as sample_dataset:
             self.assertIsInstance(sample_dataset, SampleDataset)
             self.assertEqual(sample_dataset.dataset_name, "TestDataset")
@@ -131,7 +131,7 @@ def test_set_task_writes_cache_and_metadata(self):
             self.assertEqual(self.task.call_count, 2)
 
             # Ensure intermediate cache files are created
-            self.assertTrue((cache_dir / "task_df.ld" / "index.json").exists())
+            self.assertTrue((self.cache_dir / "task_df.ld" / "index.json").exists())
 
             # Cache artifacts should be present for StreamingDataset
             assert sample_dataset.input_dir.path is not None
@@ -156,35 +156,75 @@ def test_set_task_writes_cache_and_metadata(self):
         self.assertFalse((sample_dir / "index.json").exists())
         self.assertFalse((sample_dir / "schema.pkl").exists())
         # Ensure intermediate cache files are still present
-        self.assertTrue((cache_dir / "task_df.ld" / "index.json").exists())
+        self.assertTrue((self.cache_dir / "task_df.ld" / "index.json").exists())
 
 
     def test_default_cache_dir_is_used(self):
         """When cache_dir is omitted, default cache dir should be used."""
-        task_cache = self.dataset.cache_dir / "tasks" / self.task.task_name
+        task_params = json.dumps(
+            {"call_count": 0},
+            sort_keys=True,
+            default=str
+        )
+
+        task_cache = self.dataset.cache_dir / "tasks" / f"{self.task.task_name}_{uuid.uuid5(uuid.NAMESPACE_DNS, task_params)}"
         sample_dataset = self.dataset.set_task(self.task)
 
         self.assertTrue(task_cache.exists())
         self.assertTrue((task_cache / "task_df.ld" / "index.json").exists())
         self.assertTrue((self.dataset.cache_dir / "global_event_df.parquet").exists())
         self.assertEqual(len(sample_dataset), 4)
 
+        sample_dataset.close()
+
     def test_reuses_existing_cache_without_regeneration(self):
         """Second call should reuse cached samples instead of recomputing."""
-        cache_dir = self._task_cache_dir()
-        _ = self.dataset.set_task(self.task, cache_dir=cache_dir)
+        sample_dataset = self.dataset.set_task(self.task, cache_dir=self.cache_dir)
         self.assertEqual(self.task.call_count, 2)
 
         with patch.object(
             self.task, "__call__", side_effect=AssertionError("Task should not rerun")
         ):
             cached_dataset = self.dataset.set_task(
-                self.task, cache_dir=cache_dir, cache_format="parquet"
+                self.task, cache_dir=self.cache_dir, cache_format="parquet"
             )
 
         self.assertEqual(len(cached_dataset), 4)
         self.assertEqual(self.task.call_count, 2)
 
+        sample_dataset.close()
+        cached_dataset.close()
+
+    def test_tasks_with_diff_param_values_get_diff_caches(self):
+        sample_dataset1 = self.dataset.set_task(MockTask(param=1))
+        sample_dataset2 = self.dataset.set_task(MockTask(param=2))
+        # Rebuild each expected default cache path from that task's own vars().
+        task_params1 = json.dumps(
+            {"call_count": 0, "param": 1},
+            sort_keys=True,
+            default=str
+        )
+
+        task_params2 = json.dumps(
+            {"call_count": 0, "param": 2},
+            sort_keys=True,
+            default=str
+        )
+
+        task_cache1 = self.dataset.cache_dir / "tasks" / f"{self.task.task_name}_{uuid.uuid5(uuid.NAMESPACE_DNS, task_params1)}"
+        task_cache2 = self.dataset.cache_dir / "tasks" / f"{self.task.task_name}_{uuid.uuid5(uuid.NAMESPACE_DNS, task_params2)}"
+        self.assertNotEqual(task_cache1, task_cache2)  # different params must not share a cache dir
+        self.assertTrue(task_cache1.exists())
+        self.assertTrue(task_cache2.exists())
+        self.assertTrue((task_cache1 / "task_df.ld" / "index.json").exists())
+        self.assertTrue((task_cache2 / "task_df.ld" / "index.json").exists())
+        self.assertTrue((self.dataset.cache_dir / "global_event_df.parquet").exists())
+        self.assertEqual(len(sample_dataset1), 4)
+        self.assertEqual(len(sample_dataset2), 4)
+
+        sample_dataset1.close()
+        sample_dataset2.close()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/core/test_chestxray14.py b/tests/core/test_chestxray14.py
diff --git a/tests/core/test_mimic3_readmission_prediction.py b/tests/core/test_mimic3_readmission_prediction.py