
Commit 84f2ea2

refactor(validation): align with universal principles, add tests, fix types and format
- Extract validation helpers to meet 50-line rule (_header_missing_and_extra, _get_csv_read_kwargs, _validate_optional_columns_json)
- Extract gcsutil._archive_raw_and_write_validated; add type hints to rename_file
- Add tests: PDP rename/validate_dataframe, CSV read failure, gcsutil error propagation, edvise institution_identifier in validate_file call
- Remove unused validation_edvise_normalize and its tests
- Fix mypy in validation_pdp_edvise and tests (Optional[List], cast, annotations)
- Apply ruff format

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent a33c311 commit 84f2ea2

9 files changed: +535 additions, -854 deletions

src/webapp/gcsutil.py

Lines changed: 22 additions & 11 deletions
```diff
@@ -5,6 +5,7 @@
 import logging
 from typing import Any, Dict, List, Optional
 
+import pandas as pd
 from pydantic import BaseModel
 from google.cloud import storage
 import google.auth
@@ -22,10 +23,10 @@
 
 
 def rename_file(
-    bucket_name,
-    file_name,
-    new_file_name,
-):
+    bucket_name: str,
+    file_name: str,
+    new_file_name: str,
+) -> None:
     """Moves a blob from one bucket to another with a new name."""
     storage_client = storage.Client()
     source_bucket = storage_client.bucket(bucket_name)
@@ -342,8 +343,8 @@ def validate_file(
             inst_schema: Optional extension schema with institutions.* blocks.
             institution_id: Key into inst_schema["institutions"]: "edvise", "pdp", or
                 institution UUID for custom. Default "pdp" for backward compatibility.
-            institution_identifier: Optional institution ID (e.g. UUID) for Edvise
-                normalization. Pass when institution_id == "edvise".
+            institution_identifier: Optional institution ID (e.g. UUID). Reserved for
+                future use; Edvise uses JSON-based validation only (different shape).
 
         Returns:
             List of inferred schema names (e.g. ["STUDENT"]).
@@ -386,21 +387,30 @@ def validate_file(
                 "cannot write validated output (e.g. empty schema list)."
             )
 
-        raw_blob_name = f"raw/{file_name}"
         validated_blob_name = f"validated/{file_name}"
         validated_blob = bucket.blob(validated_blob_name)
         if validated_blob.exists():
            raise ValueError(validated_blob_name + ": File already exists.")
 
+        self._archive_raw_and_write_validated(bucket, blob, file_name, normalized_df)
+        return inferred_schema_names
+
+    def _archive_raw_and_write_validated(
+        self,
+        bucket: Any,
+        blob: Any,
+        file_name: str,
+        normalized_df: pd.DataFrame,
+    ) -> None:
+        """Copy blob to raw/, write normalized DataFrame to validated/, delete from unvalidated/."""
+        raw_blob_name = f"raw/{file_name}"
+        validated_blob_name = f"validated/{file_name}"
         bucket.copy_blob(blob, bucket, raw_blob_name)
         logging.debug("Archived original to %s", raw_blob_name)
-
         self._write_dataframe_to_gcs_as_csv(bucket, validated_blob_name, normalized_df)
         logging.debug("Wrote normalized data to %s", validated_blob_name)
-
         blob.delete()
         logging.debug("Validation complete: validated=normalized, raw=archived")
-        return inferred_schema_names
 
     def _run_validation_and_get_normalized_df(
         self,
@@ -434,11 +444,12 @@ def _run_validation_and_get_normalized_df(
             logging.exception("Validation failed for %s: %s", file_name, e)
             raise
         except Exception as e:
+            # Log any other error with context before re-raising (no silent failures).
             logging.exception("Validation failed for %s: %s", file_name, e)
             raise
 
     def _write_dataframe_to_gcs_as_csv(
-        self, bucket: Any, blob_name: str, normalized_df: Any
+        self, bucket: Any, blob_name: str, normalized_df: pd.DataFrame
     ) -> None:
         """Write a DataFrame to GCS as UTF-8 CSV. Used for validated/ output."""
         csv_buffer = io.StringIO()
```

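The archive-then-write-then-delete sequence that the new `_archive_raw_and_write_validated` helper encapsulates can be pictured as a standalone function. The sketch below is illustrative only, not the module's code: `archive_and_write_validated` is a hypothetical name, and it assumes a `google.cloud.storage` bucket plus a pandas DataFrame, as in the diff above.

```python
import io
import logging

import pandas as pd
from google.cloud import storage


def archive_and_write_validated(
    bucket: storage.Bucket,
    blob: storage.Blob,
    file_name: str,
    normalized_df: pd.DataFrame,
) -> None:
    """Copy the unvalidated blob to raw/, write the normalized CSV to validated/, then delete it."""
    raw_blob_name = f"raw/{file_name}"
    validated_blob_name = f"validated/{file_name}"

    # Archive the original upload under raw/ before touching anything else.
    bucket.copy_blob(blob, bucket, raw_blob_name)
    logging.debug("Archived original to %s", raw_blob_name)

    # Serialize the normalized DataFrame and upload it as UTF-8 CSV under validated/.
    csv_buffer = io.StringIO()
    normalized_df.to_csv(csv_buffer, index=False)
    bucket.blob(validated_blob_name).upload_from_string(
        csv_buffer.getvalue(), content_type="text/csv"
    )

    # Only after both writes succeed is the unvalidated/ copy removed.
    blob.delete()
```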
src/webapp/gcsutil_test.py

Lines changed: 52 additions & 12 deletions
```diff
@@ -16,7 +16,7 @@
 # --------------------------------------------------------------------------- #
 
 
-def test_validate_file_raises_on_empty_file_name():
+def test_validate_file_raises_on_empty_file_name() -> None:
     """Rejects empty file_name with clear ValueError."""
     control = StorageControl()
     with pytest.raises(ValueError, match="file_name is required and must be non-empty"):
@@ -28,7 +28,7 @@ def test_validate_file_raises_on_empty_file_name():
     )
 
 
-def test_validate_file_raises_on_whitespace_only_file_name():
+def test_validate_file_raises_on_whitespace_only_file_name() -> None:
     """Rejects whitespace-only file_name."""
     control = StorageControl()
     with pytest.raises(ValueError, match="file_name is required and must be non-empty"):
@@ -40,7 +40,7 @@ def test_validate_file_raises_on_whitespace_only_file_name():
     )
 
 
-def test_validate_file_raises_on_file_name_with_slash():
+def test_validate_file_raises_on_file_name_with_slash() -> None:
     """Rejects file_name containing '/'."""
     control = StorageControl()
     with pytest.raises(ValueError, match="file_name must not contain"):
@@ -52,7 +52,7 @@ def test_validate_file_raises_on_file_name_with_slash():
     )
 
 
-def test_validate_file_raises_on_empty_allowed_schemas():
+def test_validate_file_raises_on_empty_allowed_schemas() -> None:
     """Rejects empty allowed_schemas."""
     control = StorageControl()
     with pytest.raises(ValueError, match="allowed_schemas must not be empty"):
@@ -69,7 +69,7 @@ def test_validate_file_raises_on_empty_allowed_schemas():
 # --------------------------------------------------------------------------- #
 
 
-def test_validate_file_raises_when_unvalidated_blob_not_found():
+def test_validate_file_raises_when_unvalidated_blob_not_found() -> None:
     """Raises ValueError with clear message when file not in unvalidated/."""
     mock_bucket = MagicMock()
     mock_blob = MagicMock()
@@ -90,7 +90,7 @@ def test_validate_file_raises_when_unvalidated_blob_not_found():
     )
 
 
-def test_validate_file_raises_when_normalized_df_none():
+def test_validate_file_raises_when_normalized_df_none() -> None:
     """Raises ValueError when validation returns normalized_df None (e.g. empty schema)."""
     mock_bucket = MagicMock()
     mock_blob = MagicMock()
@@ -119,7 +119,7 @@ def test_validate_file_raises_when_normalized_df_none():
     )
 
 
-def test_validate_file_raises_when_validated_blob_already_exists():
+def test_validate_file_raises_when_validated_blob_already_exists() -> None:
     """Raises ValueError when validated/{file_name} already exists."""
     mock_bucket = MagicMock()
     mock_unvalidated_blob = MagicMock()
@@ -161,7 +161,9 @@ def blob_side_effect(name: str) -> Any:
 # --------------------------------------------------------------------------- #
 
 
-def test_validate_file_success_archives_raw_writes_validated_deletes_unvalidated():
+def test_validate_file_success_archives_raw_writes_validated_deletes_unvalidated() -> (
+    None
+):
     """On success: copies to raw/, writes normalized CSV to validated/, deletes unvalidated/."""
     mock_bucket = MagicMock()
     mock_unvalidated_blob = MagicMock()
@@ -215,7 +217,7 @@ def blob_side_effect(name: str) -> Any:
 # --------------------------------------------------------------------------- #
 
 
-def test_validate_file_propagates_hard_validation_error():
+def test_validate_file_propagates_hard_validation_error() -> None:
     """HardValidationError from validation is not wrapped and propagates."""
     mock_bucket = MagicMock()
     mock_blob = MagicMock()
@@ -246,7 +248,7 @@ def test_validate_file_propagates_hard_validation_error():
 # --------------------------------------------------------------------------- #
 
 
-def test_run_validation_and_get_normalized_df_returns_names_and_df():
+def test_run_validation_and_get_normalized_df_returns_names_and_df() -> None:
     """Returns (inferred_schema_names, normalized_df) when validation succeeds."""
     mock_blob = MagicMock()
     mock_file = io.StringIO("foo_col,bar_col\n1,a\n2,b\n")
@@ -274,7 +276,9 @@ def test_run_validation_and_get_normalized_df_returns_names_and_df():
     assert list(df.columns) == ["x"]
 
 
-def test_run_validation_and_get_normalized_df_propagates_hard_validation_error():
+def test_run_validation_and_get_normalized_df_propagates_hard_validation_error() -> (
+    None
+):
     """HardValidationError is re-raised without wrapping."""
     mock_blob = MagicMock()
     mock_file = io.StringIO("bad")
@@ -291,12 +295,48 @@ def test_run_validation_and_get_normalized_df_propagates_hard_validation_error()
     )
 
 
+def test_run_validation_and_get_normalized_df_propagates_value_error() -> None:
+    """ValueError from validate_file_reader (e.g. encoding) is re-raised."""
+    mock_blob = MagicMock()
+    mock_file = io.StringIO("data")
+    mock_blob.open.return_value.__enter__ = lambda self: mock_file
+    mock_blob.open.return_value.__exit__ = lambda self, *args: None
+
+    control = StorageControl()
+    with patch(
+        "src.webapp.gcsutil.validate_file_reader",
+        side_effect=ValueError("Invalid file format"),
+    ):
+        with pytest.raises(ValueError, match="Invalid file format"):
+            control._run_validation_and_get_normalized_df(
+                mock_blob, "f.csv", ["STUDENT"], {}, None, "pdp", None
+            )
+
+
+def test_run_validation_and_get_normalized_df_propagates_unicode_error() -> None:
+    """UnicodeError from validate_file_reader (e.g. decode) is re-raised."""
+    mock_blob = MagicMock()
+    mock_file = io.StringIO("data")
+    mock_blob.open.return_value.__enter__ = lambda self: mock_file
+    mock_blob.open.return_value.__exit__ = lambda self, *args: None
+
+    control = StorageControl()
+    with patch(
+        "src.webapp.gcsutil.validate_file_reader",
+        side_effect=UnicodeDecodeError("utf-8", b"x", 0, 1, "invalid"),
+    ):
+        with pytest.raises(UnicodeDecodeError):
+            control._run_validation_and_get_normalized_df(
+                mock_blob, "f.csv", ["STUDENT"], {}, None, "pdp", None
+            )
+
+
 # --------------------------------------------------------------------------- #
 # _write_dataframe_to_gcs_as_csv
 # --------------------------------------------------------------------------- #
 
 
-def test_write_dataframe_to_gcs_as_csv_uploads_utf8_csv():
+def test_write_dataframe_to_gcs_as_csv_uploads_utf8_csv() -> None:
     """Writes DataFrame as UTF-8 CSV with correct content_type."""
     mock_blob = MagicMock()
     mock_bucket = MagicMock()
```

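The new error-propagation tests work by wiring `__enter__`/`__exit__` onto a `MagicMock` so that `blob.open(...)` behaves like a context manager over an in-memory file. A minimal, self-contained sketch of that pattern (the test name here is hypothetical, not part of the suite):

```python
import io
from unittest.mock import MagicMock


def test_mock_blob_open_yields_in_memory_file() -> None:
    """Illustrative only: fake blob.open(...) as a context manager over a StringIO."""
    mock_blob = MagicMock()
    mock_file = io.StringIO("foo_col,bar_col\n1,a\n")

    # MagicMock lets magic methods be assigned directly; __enter__ hands back the fake file.
    mock_blob.open.return_value.__enter__ = lambda self: mock_file
    mock_blob.open.return_value.__exit__ = lambda self, *args: None

    with mock_blob.open("r") as f:
        assert f.readline() == "foo_col,bar_col\n"
```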
src/webapp/routers/data_test.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -1022,8 +1022,10 @@ def test_validate_file_with_edvise_schema(edvise_client: TestClient) -> None:
     assert response.json()["inst_id"] == uuid_to_str(EDVISE_INST_UUID)
     assert response.json()["source"] == "MANUAL_UPLOAD"
 
-    # Verify that validate_file was called (Edvise schema was used)
+    # Verify that validate_file was called with institution_identifier for Edvise
     assert MOCK_STORAGE.validate_file.called
+    call_kwargs = MOCK_STORAGE.validate_file.call_args.kwargs
+    assert call_kwargs.get("institution_identifier") == uuid_to_str(EDVISE_INST_UUID)
 
 
 def test_validation_helper_edvise_schema_not_found(
```

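The strengthened assertion checks not only that the mocked `validate_file` was called but also which keyword arguments it received, via `call_args.kwargs` (available on Python 3.8+). A minimal sketch of that inspection pattern, with a generic `MagicMock` standing in for the test module's `MOCK_STORAGE`:

```python
from unittest.mock import MagicMock

# Generic stand-in for the mocked storage control used in the test module.
storage_mock = MagicMock()
storage_mock.validate_file(file_name="roster.csv", institution_identifier="abc-123")

assert storage_mock.validate_file.called
call_kwargs = storage_mock.validate_file.call_args.kwargs
assert call_kwargs.get("institution_identifier") == "abc-123"
```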