
Commit 84f2ea2

refactor(validation): align with universal principles, add tests, fix types and format
- Extract validation helpers to meet 50-line rule (_header_missing_and_extra, _get_csv_read_kwargs, _validate_optional_columns_json)
- Extract gcsutil._archive_raw_and_write_validated; add type hints to rename_file
- Add tests: PDP rename/validate_dataframe, CSV read failure, gcsutil error propagation, edvise institution_identifier in validate_file call
- Remove unused validation_edvise_normalize and its tests
- Fix mypy in validation_pdp_edvise and tests (Optional[List], cast, annotations)
- Apply ruff format

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent a33c311 commit 84f2ea2

9 files changed: +535 additions, -854 deletions

src/webapp/gcsutil.py

Lines changed: 22 additions & 11 deletions
```diff
@@ -5,6 +5,7 @@
 import logging
 from typing import Any, Dict, List, Optional
 
+import pandas as pd
 from pydantic import BaseModel
 from google.cloud import storage
 import google.auth
@@ -22,10 +23,10 @@
 
 
 def rename_file(
-    bucket_name,
-    file_name,
-    new_file_name,
-):
+    bucket_name: str,
+    file_name: str,
+    new_file_name: str,
+) -> None:
     """Moves a blob from one bucket to another with a new name."""
     storage_client = storage.Client()
     source_bucket = storage_client.bucket(bucket_name)
@@ -342,8 +343,8 @@ def validate_file(
             inst_schema: Optional extension schema with institutions.* blocks.
             institution_id: Key into inst_schema["institutions"]: "edvise", "pdp", or
                 institution UUID for custom. Default "pdp" for backward compatibility.
-            institution_identifier: Optional institution ID (e.g. UUID) for Edvise
-                normalization. Pass when institution_id == "edvise".
+            institution_identifier: Optional institution ID (e.g. UUID). Reserved for
+                future use; Edvise uses JSON-based validation only (different shape).
 
         Returns:
             List of inferred schema names (e.g. ["STUDENT"]).
@@ -386,21 +387,30 @@ def validate_file(
                 "cannot write validated output (e.g. empty schema list)."
             )
 
-        raw_blob_name = f"raw/{file_name}"
         validated_blob_name = f"validated/{file_name}"
         validated_blob = bucket.blob(validated_blob_name)
         if validated_blob.exists():
            raise ValueError(validated_blob_name + ": File already exists.")
 
+        self._archive_raw_and_write_validated(bucket, blob, file_name, normalized_df)
+        return inferred_schema_names
+
+    def _archive_raw_and_write_validated(
+        self,
+        bucket: Any,
+        blob: Any,
+        file_name: str,
+        normalized_df: pd.DataFrame,
+    ) -> None:
+        """Copy blob to raw/, write normalized DataFrame to validated/, delete from unvalidated/."""
+        raw_blob_name = f"raw/{file_name}"
+        validated_blob_name = f"validated/{file_name}"
         bucket.copy_blob(blob, bucket, raw_blob_name)
         logging.debug("Archived original to %s", raw_blob_name)
-
         self._write_dataframe_to_gcs_as_csv(bucket, validated_blob_name, normalized_df)
         logging.debug("Wrote normalized data to %s", validated_blob_name)
-
         blob.delete()
         logging.debug("Validation complete: validated=normalized, raw=archived")
-        return inferred_schema_names
 
     def _run_validation_and_get_normalized_df(
         self,
@@ -434,11 +444,12 @@ def _run_validation_and_get_normalized_df(
             logging.exception("Validation failed for %s: %s", file_name, e)
             raise
         except Exception as e:
+            # Log any other error with context before re-raising (no silent failures).
             logging.exception("Validation failed for %s: %s", file_name, e)
             raise
 
     def _write_dataframe_to_gcs_as_csv(
-        self, bucket: Any, blob_name: str, normalized_df: Any
+        self, bucket: Any, blob_name: str, normalized_df: pd.DataFrame
     ) -> None:
         """Write a DataFrame to GCS as UTF-8 CSV. Used for validated/ output."""
         csv_buffer = io.StringIO()
```

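The archive-then-write-then-delete sequence that the new `_archive_raw_and_write_validated` helper encapsulates can be pictured as a standalone function. The sketch below is illustrative only, not the module's code: `archive_and_write_validated` is a hypothetical name, and it assumes a `google.cloud.storage` bucket plus a pandas DataFrame, as in the diff above.

```python
import io
import logging

import pandas as pd
from google.cloud import storage


def archive_and_write_validated(
    bucket: storage.Bucket,
    blob: storage.Blob,
    file_name: str,
    normalized_df: pd.DataFrame,
) -> None:
    """Copy the unvalidated blob to raw/, write the normalized CSV to validated/, then delete it."""
    raw_blob_name = f"raw/{file_name}"
    validated_blob_name = f"validated/{file_name}"

    # Archive the original upload under raw/ before touching anything else.
    bucket.copy_blob(blob, bucket, raw_blob_name)
    logging.debug("Archived original to %s", raw_blob_name)

    # Serialize the normalized DataFrame and upload it as UTF-8 CSV under validated/.
    csv_buffer = io.StringIO()
    normalized_df.to_csv(csv_buffer, index=False)
    bucket.blob(validated_blob_name).upload_from_string(
        csv_buffer.getvalue(), content_type="text/csv"
    )

    # Only after both writes succeed is the unvalidated/ copy removed.
    blob.delete()
```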
src/webapp/gcsutil_test.py

Lines changed: 52 additions & 12 deletions
```diff
@@ -16,7 +16,7 @@
 # --------------------------------------------------------------------------- #
 
 
-def test_validate_file_raises_on_empty_file_name():
+def test_validate_file_raises_on_empty_file_name() -> None:
     """Rejects empty file_name with clear ValueError."""
     control = StorageControl()
     with pytest.raises(ValueError, match="file_name is required and must be non-empty"):
@@ -28,7 +28,7 @@ def test_validate_file_raises_on_empty_file_name():
     )
 
 
-def test_validate_file_raises_on_whitespace_only_file_name():
+def test_validate_file_raises_on_whitespace_only_file_name() -> None:
     """Rejects whitespace-only file_name."""
     control = StorageControl()
     with pytest.raises(ValueError, match="file_name is required and must be non-empty"):
@@ -40,7 +40,7 @@ def test_validate_file_raises_on_whitespace_only_file_name():
     )
 
 
-def test_validate_file_raises_on_file_name_with_slash():
+def test_validate_file_raises_on_file_name_with_slash() -> None:
     """Rejects file_name containing '/'."""
     control = StorageControl()
     with pytest.raises(ValueError, match="file_name must not contain"):
@@ -52,7 +52,7 @@ def test_validate_file_raises_on_file_name_with_slash():
     )
 
 
-def test_validate_file_raises_on_empty_allowed_schemas():
+def test_validate_file_raises_on_empty_allowed_schemas() -> None:
     """Rejects empty allowed_schemas."""
     control = StorageControl()
     with pytest.raises(ValueError, match="allowed_schemas must not be empty"):
@@ -69,7 +69,7 @@ def test_validate_file_raises_on_empty_allowed_schemas():
 # --------------------------------------------------------------------------- #
 
 
-def test_validate_file_raises_when_unvalidated_blob_not_found():
+def test_validate_file_raises_when_unvalidated_blob_not_found() -> None:
     """Raises ValueError with clear message when file not in unvalidated/."""
     mock_bucket = MagicMock()
     mock_blob = MagicMock()
@@ -90,7 +90,7 @@ def test_validate_file_raises_when_unvalidated_blob_not_found():
     )
 
 
-def test_validate_file_raises_when_normalized_df_none():
+def test_validate_file_raises_when_normalized_df_none() -> None:
     """Raises ValueError when validation returns normalized_df None (e.g. empty schema)."""
     mock_bucket = MagicMock()
     mock_blob = MagicMock()
@@ -119,7 +119,7 @@ def test_validate_file_raises_when_normalized_df_none():
     )
 
 
-def test_validate_file_raises_when_validated_blob_already_exists():
+def test_validate_file_raises_when_validated_blob_already_exists() -> None:
     """Raises ValueError when validated/{file_name} already exists."""
     mock_bucket = MagicMock()
     mock_unvalidated_blob = MagicMock()
@@ -161,7 +161,9 @@ def blob_side_effect(name: str) -> Any:
 # --------------------------------------------------------------------------- #
 
 
-def test_validate_file_success_archives_raw_writes_validated_deletes_unvalidated():
+def test_validate_file_success_archives_raw_writes_validated_deletes_unvalidated() -> (
+    None
+):
     """On success: copies to raw/, writes normalized CSV to validated/, deletes unvalidated/."""
     mock_bucket = MagicMock()
     mock_unvalidated_blob = MagicMock()
@@ -215,7 +217,7 @@ def blob_side_effect(name: str) -> Any:
 # --------------------------------------------------------------------------- #
 
 
-def test_validate_file_propagates_hard_validation_error():
+def test_validate_file_propagates_hard_validation_error() -> None:
     """HardValidationError from validation is not wrapped and propagates."""
     mock_bucket = MagicMock()
     mock_blob = MagicMock()
@@ -246,7 +248,7 @@ def test_validate_file_propagates_hard_validation_error():
 # --------------------------------------------------------------------------- #
 
 
-def test_run_validation_and_get_normalized_df_returns_names_and_df():
+def test_run_validation_and_get_normalized_df_returns_names_and_df() -> None:
     """Returns (inferred_schema_names, normalized_df) when validation succeeds."""
     mock_blob = MagicMock()
     mock_file = io.StringIO("foo_col,bar_col\n1,a\n2,b\n")
@@ -274,7 +276,9 @@ def test_run_validation_and_get_normalized_df_returns_names_and_df():
     assert list(df.columns) == ["x"]
 
 
-def test_run_validation_and_get_normalized_df_propagates_hard_validation_error():
+def test_run_validation_and_get_normalized_df_propagates_hard_validation_error() -> (
+    None
+):
     """HardValidationError is re-raised without wrapping."""
     mock_blob = MagicMock()
     mock_file = io.StringIO("bad")
@@ -291,12 +295,48 @@ def test_run_validation_and_get_normalized_df_propagates_hard_validation_error()
     )
 
 
+def test_run_validation_and_get_normalized_df_propagates_value_error() -> None:
+    """ValueError from validate_file_reader (e.g. encoding) is re-raised."""
+    mock_blob = MagicMock()
+    mock_file = io.StringIO("data")
+    mock_blob.open.return_value.__enter__ = lambda self: mock_file
+    mock_blob.open.return_value.__exit__ = lambda self, *args: None
+
+    control = StorageControl()
+    with patch(
+        "src.webapp.gcsutil.validate_file_reader",
+        side_effect=ValueError("Invalid file format"),
+    ):
+        with pytest.raises(ValueError, match="Invalid file format"):
+            control._run_validation_and_get_normalized_df(
+                mock_blob, "f.csv", ["STUDENT"], {}, None, "pdp", None
+            )
+
+
+def test_run_validation_and_get_normalized_df_propagates_unicode_error() -> None:
+    """UnicodeError from validate_file_reader (e.g. decode) is re-raised."""
+    mock_blob = MagicMock()
+    mock_file = io.StringIO("data")
+    mock_blob.open.return_value.__enter__ = lambda self: mock_file
+    mock_blob.open.return_value.__exit__ = lambda self, *args: None
+
+    control = StorageControl()
+    with patch(
+        "src.webapp.gcsutil.validate_file_reader",
+        side_effect=UnicodeDecodeError("utf-8", b"x", 0, 1, "invalid"),
+    ):
+        with pytest.raises(UnicodeDecodeError):
+            control._run_validation_and_get_normalized_df(
+                mock_blob, "f.csv", ["STUDENT"], {}, None, "pdp", None
+            )
+
+
 # --------------------------------------------------------------------------- #
 # _write_dataframe_to_gcs_as_csv
 # --------------------------------------------------------------------------- #
 
 
-def test_write_dataframe_to_gcs_as_csv_uploads_utf8_csv():
+def test_write_dataframe_to_gcs_as_csv_uploads_utf8_csv() -> None:
     """Writes DataFrame as UTF-8 CSV with correct content_type."""
     mock_blob = MagicMock()
     mock_bucket = MagicMock()
```

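The new error-propagation tests work by wiring `__enter__`/`__exit__` onto a `MagicMock` so that `blob.open(...)` behaves like a context manager over an in-memory file. A minimal, self-contained sketch of that pattern (the test name here is hypothetical, not part of the suite):

```python
import io
from unittest.mock import MagicMock


def test_mock_blob_open_yields_in_memory_file() -> None:
    """Illustrative only: fake blob.open(...) as a context manager over a StringIO."""
    mock_blob = MagicMock()
    mock_file = io.StringIO("foo_col,bar_col\n1,a\n")

    # MagicMock lets magic methods be assigned directly; __enter__ hands back the fake file.
    mock_blob.open.return_value.__enter__ = lambda self: mock_file
    mock_blob.open.return_value.__exit__ = lambda self, *args: None

    with mock_blob.open("r") as f:
        assert f.readline() == "foo_col,bar_col\n"
```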
src/webapp/routers/data_test.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -1022,8 +1022,10 @@ def test_validate_file_with_edvise_schema(edvise_client: TestClient) -> None:
     assert response.json()["inst_id"] == uuid_to_str(EDVISE_INST_UUID)
     assert response.json()["source"] == "MANUAL_UPLOAD"
 
-    # Verify that validate_file was called (Edvise schema was used)
+    # Verify that validate_file was called with institution_identifier for Edvise
     assert MOCK_STORAGE.validate_file.called
+    call_kwargs = MOCK_STORAGE.validate_file.call_args.kwargs
+    assert call_kwargs.get("institution_identifier") == uuid_to_str(EDVISE_INST_UUID)
 
 
 def test_validation_helper_edvise_schema_not_found(
```

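The strengthened assertion checks not only that the mocked `validate_file` was called but also which keyword arguments it received, via `call_args.kwargs` (available on Python 3.8+). A minimal sketch of that inspection pattern, with a generic `MagicMock` standing in for the test module's `MOCK_STORAGE`:

```python
from unittest.mock import MagicMock

# Generic stand-in for the mocked storage control used in the test module.
storage_mock = MagicMock()
storage_mock.validate_file(file_name="roster.csv", institution_identifier="abc-123")

assert storage_mock.validate_file.called
call_kwargs = storage_mock.validate_file.call_args.kwargs
assert call_kwargs.get("institution_identifier") == "abc-123"
```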