22import logging
33import time
44from abc import ABC
5- from typing import Callable , List
5+ from dataclasses import dataclass
6+ from typing import Callable , List , Optional
67
78import geopandas as gpd
9+ import pandas as pd
810import pandera .pandas as pa
11+ from pandera import Check
912
1013from src .config .config import USE_CRS
1114from src .constants .city_limits import PHL_GEOMETRY
@@ -23,15 +26,16 @@ def __bool__(self):
2326class BaseValidator (ABC ):
2427 """Base class for service-specific data validation."""
2528
26- schema : pa .DataFrameModel = None
29+ schema : pa .DataFrameSchema = None
2730
def __init_subclass__(cls, **kwargs):
    """Reject subclasses whose ``schema`` is not a pandera DataFrameSchema.

    Runs once for every subclass at class-creation time. A subclass may leave
    ``schema`` unset (``None``); when it is set, it must be an *instance* of
    ``pa.DataFrameSchema``.

    Args:
        **kwargs: Class keyword arguments, forwarded untouched to the next
            ``__init_subclass__`` in the MRO so cooperating bases keep working.

    Raises:
        TypeError: If ``schema`` is set to anything other than a
            ``pa.DataFrameSchema`` instance.
    """
    declared = getattr(cls, "schema", None)
    if declared is not None and not isinstance(declared, pa.DataFrameSchema):
        # No debug printing here: the exception message already names the
        # offending class, and import-time stdout noise is unwanted.
        raise TypeError(
            f"{cls.__name__} must define a 'schema' class variable that is an instance of pandera.DataFrameSchema."
        )
    return super().__init_subclass__(**kwargs)
3741
@@ -197,9 +201,9 @@ def validate(self, gdf: gpd.GeoDataFrame) -> ValidationResult:
197201 schema_start = time .time ()
198202 if self .schema :
199203 try :
200- self .schema .validate (gdf , lazy_validation = True )
204+ self .schema .validate (gdf , lazy = True )
201205 except pa .errors .SchemaErrors as err :
202- self .errors .append (err .failure_case )
206+ self .errors .append (err .failure_cases )
203207 schema_time = time .time () - schema_start
204208
205209 # Custom validation
@@ -258,3 +262,93 @@ def wrapper(gdf: gpd.GeoDataFrame = None, *args, **kwargs):
258262 return wrapper
259263
260264 return decorator
265+
266+
# Reusable element-wise check: fails for any value equal to the literal
# string "NA".
no_na_check = Check.ne(
    "NA",
    error="Value cannot be NA",
)

# Reusable column-level check: passes only when the column has no repeated
# values (relies on the column's ``is_unique`` attribute).
unique_check = Check(
    lambda column: column.is_unique,
    error="Should have all unique values",
)
270+
271+
def unique_value_check(lower: int, upper: int) -> Check:
    """Build a check on how many distinct values a column holds.

    Args:
        lower: Smallest acceptable distinct-value count (inclusive).
        upper: Upper limit on the distinct-value count (exclusive).

    Returns:
        A ``Check`` that passes when ``lower <= column.nunique() < upper``.
    """
    message = f"Number of unique values is roughly between {lower} and {upper}"
    # Half-open interval: ``lower`` is allowed, ``upper`` itself is not.
    return Check(lambda s: lower <= s.nunique() < upper, error=message)
277+
278+
def null_percentage_check(null_percent: float) -> Check:
    """Build a check that a column's share of nulls is close to an expectation.

    The observed value is ``column.isnull().mean()``, i.e. a fraction in
    [0, 1], and it must fall within 80%-120% of ``null_percent`` (both ends
    inclusive).

    Args:
        null_percent: Expected share of nulls. It is compared directly against
            the 0-1 fraction above, so presumably it should be given as a
            fraction rather than 0-100 -- TODO confirm with callers.

    Returns:
        A ``Check`` enforcing the tolerance band described above.
    """
    floor = 0.8 * null_percent
    ceiling = 1.2 * null_percent
    return Check(
        lambda s: floor <= s.isnull().mean() <= ceiling,
        error=f"Percentage of nulls in column should be roughly {null_percent}",
    )
285+
286+
287+ @dataclass
288+ class DistributionParams :
289+ min_value : Optional [int | float ] = None
290+ max_value : Optional [int | float ] = None
291+ mean : Optional [int | float ] = None
292+ median : Optional [int | float ] = None
293+ std : Optional [int | float ] = None
294+ q1 : Optional [int | float ] = None
295+ q3 : Optional [int | float ] = None
296+
297+
def _banded_check(statistic: Callable, target, label: str) -> Check:
    """Build a Check that a column statistic lies within 20% of ``target``."""
    # Order the band edges explicitly: for a negative target, 0.8 * target is
    # the larger of the two, so an unsorted band could never be satisfied.
    low, high = sorted((0.8 * target, 1.2 * target))
    return Check(
        lambda s: bool(low <= statistic(pd.to_numeric(s, errors="coerce")) <= high),
        error=f"Column {label} should be roughly {target}",
    )


def distribution_check(params: DistributionParams) -> List[Check]:
    """Translate expected distribution statistics into pandera checks.

    One check is produced per field of ``params`` that is not ``None``, in the
    order: min, max, mean, median, standard deviation, first quartile, third
    quartile. Each check first coerces the column with
    ``pd.to_numeric(..., errors="coerce")``.

    * ``min_value`` / ``max_value`` are hard bounds on the column minimum and
      maximum (inclusive).
    * The other statistics must fall within 80%-120% of their target
      (inclusive). A target of exactly 0 collapses that band to the single
      value 0.

    Args:
        params: Expected statistics; fields left as ``None`` are skipped.

    Returns:
        The list of checks, possibly empty.
    """
    checks = []

    # Test against None, not truthiness: 0 is a legitimate bound or target and
    # must still produce a check.
    if params.min_value is not None:
        floor = params.min_value
        checks.append(
            Check(
                lambda s: bool(pd.to_numeric(s, errors="coerce").min() >= floor),
                error=f"Column minimum should be at least {floor}",
            )
        )
    if params.max_value is not None:
        ceiling = params.max_value
        checks.append(
            Check(
                lambda s: bool(pd.to_numeric(s, errors="coerce").max() <= ceiling),
                error=f"Column maximum should be at most {ceiling}",
            )
        )

    # (label used in the error message, expected value, statistic extractor)
    banded = (
        ("mean", params.mean, lambda v: v.mean()),
        ("median", params.median, lambda v: v.quantile(0.5)),
        ("standard deviation", params.std, lambda v: v.std()),
        ("first quantile", params.q1, lambda v: v.quantile(0.25)),
        ("third quantile", params.q3, lambda v: v.quantile(0.75)),
    )
    checks.extend(
        _banded_check(statistic, target, label)
        for label, target, statistic in banded
        if target is not None
    )

    return checks
0 commit comments