Skip to content

Commit 822b740

Browse files
authored
Merge pull request #196 from datakind/feat/EdviseSchema
Feat: Add Edvise Schema Validation Support with Human-Readable Error Messages
2 parents 9bb2895 + 77e1b5b commit 822b740

28 files changed

+6280
-71
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,3 +114,6 @@ dmypy.json
114114
# terraform
115115
**/.terraform/*
116116
**/terraform.tfvars
117+
118+
# Cursor rule files
119+
.cursor/

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -86,8 +86,11 @@ lines-after-imports = 1
8686
[tool.pytest.ini_options]
8787
minversion = "8.0"
8888
addopts = ["--verbose", "--import-mode=importlib"]
89-
filterwarnings = ["ignore::DeprecationWarning"]
90-
testpaths = ["tests"]
89+
filterwarnings = [
90+
"ignore::DeprecationWarning",
91+
"ignore::FutureWarning:pandera",
92+
]
93+
testpaths = ["src"]
9194

9295
[tool.mypy]
9396
files = ["src"]

src/webapp/database.py

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
Integer,
2323
BigInteger,
2424
Index,
25+
CheckConstraint,
2526
event,
2627
)
2728
from sqlalchemy.orm import (
@@ -273,6 +274,10 @@ class InstTable(Base):
273274
state: Mapped[str | None] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True)
274275
# Only populated for PDP schools.
275276
pdp_id: Mapped[str | None] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True)
277+
# Only populated for Edvise schools.
278+
edvise_id: Mapped[str | None] = mapped_column(
279+
String(VAR_CHAR_LENGTH), nullable=True
280+
)
276281
created_at: Mapped[datetime.datetime] = mapped_column(
277282
DateTime(timezone=True),
278283
server_default=func.now(),
@@ -672,9 +677,10 @@ class DocType(enum.Enum):
672677
class SchemaRegistryTable(Base):
673678
"""
674679
Stores versioned schema documents:
675-
- Base schema (doc_type=base, is_pdp=False, inst_id NULL)
676-
- PDP shared extension (doc_type=extension, is_pdp=True, inst_id NULL)
677-
- Custom institution extension (doc_type=extension, is_pdp=False, inst_id=<UUID>)
680+
- Base schema (doc_type=base, is_pdp=False, is_edvise=False, inst_id NULL)
681+
- PDP shared extension (doc_type=extension, is_pdp=True, is_edvise=False, inst_id NULL)
682+
- Edvise shared extension (doc_type=extension, is_pdp=False, is_edvise=True, inst_id NULL)
683+
- Custom institution extension (doc_type=extension, is_pdp=False, is_edvise=False, inst_id=<UUID>)
678684
Layers can reference a parent (extends_schema_id) that they extend.
679685
"""
680686

@@ -685,11 +691,12 @@ class SchemaRegistryTable(Base):
685691
doc_type: Mapped[DocType] = mapped_column(
686692
Enum(DocType, native_enum=False), nullable=False
687693
)
688-
# Nullable: NULL for base and PDP shared extension
694+
# Nullable: NULL for base, PDP shared extension, and Edvise shared extension
689695
inst_id: Mapped[uuid.UUID | None] = mapped_column(
690696
ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=True
691697
)
692698
is_pdp: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
699+
is_edvise: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
693700
version_label: Mapped[str] = mapped_column(
694701
String(VAR_CHAR_STANDARD_LENGTH), nullable=False
695702
)
@@ -734,8 +741,12 @@ class SchemaRegistryTable(Base):
734741
UniqueConstraint("doc_type", "version_label", name="uq_base_version"),
735742
UniqueConstraint("is_pdp", "version_label", name="uq_pdp_version"),
736743
UniqueConstraint("inst_id", "version_label", name="uq_inst_version"),
744+
CheckConstraint(
745+
"NOT (is_pdp = 1 AND is_edvise = 1)", name="ck_no_pdp_and_edvise"
746+
),
737747
Index("idx_schema_active_base", "doc_type", "is_active"),
738748
Index("idx_schema_active_pdp", "is_pdp", "is_active"),
749+
Index("idx_schema_active_edvise", "is_edvise", "is_active"),
739750
Index("idx_schema_active_inst", "inst_id", "is_active"),
740751
)
741752

@@ -746,6 +757,8 @@ def namespace(self) -> str:
746757
return "base"
747758
if self.is_pdp:
748759
return "pdp"
760+
if self.is_edvise:
761+
return "edvise"
749762
if self.inst_id:
750763
return f"inst:{self.inst_id}"
751764
return "unknown"

src/webapp/databricks.py

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
from .validation_extension import generate_extension_schema
1717
from .config import databricks_vars, gcs_vars
1818
from .utilities import databricksify_inst_name, SchemaType
19-
from typing import List, Any, Dict, IO, cast, Optional
19+
from typing import List, Any, Dict, Optional
2020
from fastapi import HTTPException
2121
import requests
2222
import hashlib
@@ -25,13 +25,7 @@
2525
from cachetools import TTLCache
2626
import threading
2727
import re
28-
29-
try:
30-
import tomllib as _toml # Py 3.11+
31-
except ModuleNotFoundError:
32-
import tomli as _toml # Py ≤ 3.10
3328
import pandas as pd
34-
import re
3529

3630
# Setting up logger
3731
LOGGER = logging.getLogger(__name__)
@@ -567,7 +561,7 @@ def get_key_for_file(
567561
Case-insensitive match of file_name against mapping values.
568562
Values may be:
569563
- str literal (e.g., "student.csv") → allow optional base suffixes before the ext.
570-
- str regex (e.g., r"^course_.*\.csv$") → re.IGNORECASE fullmatch.
564+
- str regex (e.g., r"^course_.*\\.csv$") → re.IGNORECASE fullmatch.
571565
- compiled regex (re.Pattern) → fullmatch, adding IGNORECASE if missing.
572566
- list of any of the above.
573567
"""
Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
Missing required columns: 'Student ID' (Student identifier). These columns must be present in your file.
2+
3+
Unexpected columns found: 'Extra Col'. Please remove these columns or rename them to match the expected schema.
4+
5+
Column 'Age' has validation errors:
6+
• Row 2: Value validation failed. Current value: found 'None'
7+
8+
Column 'Grade' has validation errors:
9+
• Row 1: Value must be one of: A, B, C. Current value: found 'X'
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Unexpected columns found: 'Student ID (and 2 similar)'. Please remove these columns or rename them to match the expected schema.
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Missing required columns: 'Student ID' (Unique student identifier), 'Grade' (Student grade (A-F)), 'Age' (Student age). These columns must be present in your file.
Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
Column 'Age' has validation errors:
2+
• Row 2: Value validation failed. Current value: found 'None'
3+
4+
Column 'Grade' has validation errors:
5+
• Row 1: Value must be one of: A, B, C, D, F. Current value: found 'X'
6+
7+
Column 'Score' has validation errors:
8+
• Row 3: Validation failed for greater_than(0) check. Current value: found '-5'
9+
10+
Column 'Student ID' has validation errors:
11+
• Row 1: Value must be at least 3 characters long. Current value: found '****' (value masked for privacy)
Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
Column 'Grade' has validation errors:
2+
• Row 1: Value must be one of: A, B, C, D, F. Current value: found 'X'
3+
• Row 2: Value must be one of: A, B, C, D, F. Current value: found 'Y'
4+
• Row 3: Value must be one of: A, B, C, D, F. Current value: found 'Z'
5+
• Row 4: Value must be one of: A, B, C, D, F. Current value: found 'W'
Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,14 @@
1+
Column 'Course Name' has validation errors:
2+
• Row 1: Value must be at least 3 characters long. Current value: found 'XY'
3+
4+
Column 'Email' has validation errors:
5+
• Row 2: Value must be at least 5 characters long. Current value: found 'ca******om' (value masked for privacy)
6+
7+
Column 'Grade' has validation errors:
8+
• Row 2: Value must be one of: A, B, C. Current value: found 'X'
9+
10+
Column 'SSN' has validation errors:
11+
• Row 3: Value must be at least 9 characters long. Current value: found 'CA******89' (value masked for privacy)
12+
13+
Column 'Student Name' has validation errors:
14+
• Row 1: Value must be at least 3 characters long. Current value: found 'CA******23' (value masked for privacy)

0 commit comments

Comments
 (0)