forked from advanced-computing/Haixin-and-Chengpu-API
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_quality.py
More file actions
26 lines (19 loc) · 817 Bytes
/
data_quality.py
File metadata and controls
26 lines (19 loc) · 817 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
"""
Data quality check functions for the NYC Film Permits dataset.
Each function takes a DataFrame and returns True if the check passes, False otherwise.
"""
import pandas as pd
def check_no_duplicate_event_ids(df: pd.DataFrame) -> bool:
    """Check that EventID is unique, i.e. no value appears more than once.

    Args:
        df: DataFrame containing an ``EventID`` column.

    Returns:
        True if no EventID value is repeated, False otherwise. The result
        is a plain Python ``bool`` so that identity checks (``is True``)
        and JSON serialization behave as expected.

    Raises:
        KeyError: If ``df`` has no ``EventID`` column.
    """
    # `not` collapses the NumPy boolean produced by pandas into a built-in
    # bool; comparing `.sum() == 0` would leak a numpy.bool_ to callers.
    return not df["EventID"].duplicated().any()
def check_borough_values_valid(df):
    """Return True when every non-null Borough value is a valid NYC borough.

    Missing entries in the ``Borough`` column are ignored. The remaining
    values must match one of the five borough names exactly
    (case-sensitive); any other value makes the check fail.
    """
    allowed_boroughs = frozenset(
        ("Manhattan", "Brooklyn", "Queens", "Bronx", "Staten Island")
    )
    observed = df["Borough"].dropna().unique()
    # A single value outside the allowed set is enough to fail the check.
    return all(name in allowed_boroughs for name in observed)
def check_missing_rate_below_threshold(
    df: pd.DataFrame, threshold: float = 0.15
) -> bool:
    """Check that no column's share of missing values exceeds ``threshold``.

    Args:
        df: DataFrame to inspect; every column is checked.
        threshold: Maximum allowed fraction of missing values per column
            (default 0.15, i.e. 15%). The comparison is strict, so a
            column exactly at the threshold still passes.

    Returns:
        True if every column's missing fraction is at or below
        ``threshold``, False as soon as any column exceeds it.
    """
    # The column-wise mean of the null mask is the missing fraction of each
    # column. Working on the whole frame (rather than df[col] in a loop)
    # also copes with duplicate column labels, where df[col] would return a
    # DataFrame and make the comparison ambiguous.
    missing_rates = df.isnull().mean()
    return not (missing_rates > threshold).any()