Merge pull request #41 from jdkent/enh/only_post_process

jdkent · web-flow · commit 7a1921bd3c4b · 2025-06-11T14:07:21.000-05:00
Enh/only post process
diff --git a/ns_extract/cli/run.py b/ns_extract/cli/run.py
@@ -129,11 +129,7 @@ def run_pipelines(
 
             # Run pipeline
             pipeline_output_dir = output_path / pipeline_name
-            pipeline.transform_dataset(
-                dataset,
-                pipeline_output_dir,
-                **transform_args
-            )
+            pipeline.transform_dataset(dataset, pipeline_output_dir, **transform_args)
 
             print(f"Completed {pipeline_name} pipeline")
 
diff --git a/ns_extract/pipelines/base.py b/ns_extract/pipelines/base.py
@@ -178,26 +178,39 @@ def transform_dataset(
                     raw_file = study_dir / "raw_results.json"
                     results_file = study_dir / "results.json"
 
-                    # Try raw_results.json first
-                    if raw_file.exists():
-                        try:
-                            with raw_file.open() as f:
-                                raw_results[dbid] = json.load(f)
-                        except (IOError, json.JSONDecodeError) as e:
-                            raise ProcessingError(
-                                dbid, f"Failed to load raw results: {e}"
+                    try:
+                        # Try raw_results.json first
+                        if raw_file.exists():
+                            try:
+                                with raw_file.open() as f:
+                                    raw_results[dbid] = json.load(f)
+                            except (IOError, json.JSONDecodeError) as e:
+                                logger.error(
+                                    f"Failed to load raw results for study {dbid}: {e}"
+                                )
+                                continue
+                        # Fallback to results.json
+                        elif results_file.exists():
+                            try:
+                                with results_file.open() as f:
+                                    raw_results[dbid] = json.load(f)
+                            except (IOError, json.JSONDecodeError) as e:
+                                logger.error(
+                                    f"Failed to load results for study {dbid}: {e}"
+                                )
+                                continue
+                        elif post_process == "only":
+                            logger.warning(
+                                f"Skipping study {dbid}: no results found for post-processing"
                             )
-                    # Fallback to results.json
-                    elif results_file.exists():
-                        try:
-                            with results_file.open() as f:
-                                raw_results[dbid] = json.load(f)
-                        except (IOError, json.JSONDecodeError) as e:
-                            raise ProcessingError(dbid, f"Failed to load results: {e}")
-                    elif post_process == "only":
-                        raise ProcessingError(
-                            dbid, "No results found for post-processing"
-                        )
+                            continue
+                    except Exception as e:
+                        logger.error(f"Error processing study {dbid}: {e}")
+                        continue
+
+                if not raw_results and post_process == "only":
+                    logger.warning("No results found for post-processing in any study")
+                    return hash_outdir
 
                 kwargs["raw_results"] = raw_results
 
diff --git a/tests/test_example_extractor.py b/tests/test_example_extractor.py
@@ -392,6 +392,42 @@ def test_text_and_demographics_update(sample_data, mock_demographics, tmp_path):
     assert "modified_text.txt" in str(second_run_info["inputs"])
 
 
+def test_post_process_only_missing_results(sample_data, mock_demographics, tmp_path):
+    """Test post_process='only' gracefully handles missing results."""
+    demographics_dir = setup_demographics_dir(tmp_path, mock_demographics)
+    
+    # Slice the dataset to a single study ID taken from the mock demographics
+    test_study_id = list(mock_demographics.keys())[0]
+    modified_dataset = sample_data.slice([test_study_id])
+
+    # Set up the extractor and its input pipeline info without producing any results
+    extractor = ExampleExtractor()
+    input_pipeline_info = {
+        "participant_demographics": {
+            "version": "1.0.0",
+            "config_hash": "abc123",
+            "pipeline_dir": Path(demographics_dir),
+        }
+    }
+    
+    # Create the output dir only; the pipeline is never run, so it holds no results
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True)
+    
+    # Run in post-process-only mode - expected to return without raising
+    hash_dir = extractor.transform_dataset(
+        modified_dataset,
+        output_dir,
+        post_process="only",
+        input_pipeline_info=input_pipeline_info
+    )
+    
+    # Check that no result files were created for the study
+    study_dir = hash_dir / test_study_id
+    assert not (study_dir / "results.json").exists(), "No results should be created"
+    assert not (study_dir / "raw_results.json").exists(), "No raw results should be created"
+
+
 def test_post_process_and_file_handling(sample_data, mock_demographics, tmp_path):
     """Test post-processing modes and file handling behavior."""
     demographics_dir = setup_demographics_dir(tmp_path, mock_demographics)