Skip to content

Commit 0ac5bae

Browse files
committed
test: add delinquencies validator, validator unit tests
1 parent 81f7413 commit 0ac5bae

File tree

8 files changed

+834
-50
lines changed

8 files changed

+834
-50
lines changed

data/analyze_delinquencies.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Simple script to analyze delinquencies data directly from cache.
4+
"""
5+
6+
import pandas as pd
7+
8+
from src.classes.file_manager import FileManager
9+
10+
11+
def analyze_delinquencies() -> None:
    """Load the cached delinquencies data and print a diagnostic report.

    Reads the most recent ``"property_tax_delinquencies"`` cache through
    ``FileManager`` and prints, in order: the record count and column list,
    the dtype of each column of interest, per-column coverage (reporting
    literal ``"NA"`` strings separately from real nulls), descriptive
    statistics for the numeric columns, and value frequencies for the flag
    columns.

    Returns early, after printing a message, when no cache is found or when
    loading raises. Nothing is returned; all output goes to stdout.
    """
    banner = "=" * 80
    print(banner)
    print("DELINQUENCIES DATA ANALYSIS")
    print(banner)

    file_manager = FileManager()

    # This is a diagnostic script: a load failure is reported and the run
    # ends quietly instead of surfacing a traceback. Only the call that can
    # actually fail is kept inside the try body.
    try:
        gdf = file_manager.get_most_recent_cache("property_tax_delinquencies")
    except Exception as e:
        print(f"Error loading cached data: {e}")
        return
    if gdf is None:
        print("No cached delinquencies data found!")
        return

    print(f"\nTotal records: {len(gdf):,}")
    print(f"Columns: {list(gdf.columns)}")

    # Columns this report cares about; any that are absent are reported as
    # NOT FOUND (dtype/coverage sections) or skipped (analysis sections).
    columns_to_check = [
        "num_years_owed",
        "total_due",
        "total_assessment",
        "is_actionable",
        "sheriff_sale",
        "payment_agreement",
        "most_recent_year_owed",
    ]
    numeric_columns = ["num_years_owed", "total_due", "total_assessment"]
    boolean_columns = ["is_actionable", "sheriff_sale", "payment_agreement"]

    _print_dtypes(gdf, columns_to_check)
    _print_coverage(gdf, columns_to_check)
    _print_numeric_stats(gdf, numeric_columns)
    _print_value_counts(gdf, boolean_columns)

    print("\n" + banner)


def _percent(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    # A zero-row cache would otherwise produce nan and a NumPy RuntimeWarning.
    if total == 0:
        return 0.0
    return (count / total) * 100


def _print_dtypes(gdf, columns):
    """Print the dtype of each listed column, or NOT FOUND when it is absent."""
    print("\nData types:")
    for col in columns:
        if col in gdf.columns:
            print(f" {col}: {gdf[col].dtype}")
        else:
            print(f" {col}: NOT FOUND")


def _print_coverage(gdf, columns):
    """Print non-null coverage, "NA"-string count, null count and a sample per column."""
    print("\nColumn coverage:")
    total = len(gdf)
    for col in columns:
        if col not in gdf.columns:
            print(f" {col}: NOT FOUND")
            continue

        series = gdf[col]
        # Literal "NA" strings are non-null as far as pandas is concerned, so
        # they count towards coverage; they are reported separately so the
        # reader can tell them apart from real nulls.
        na_strings = (series == "NA").sum()
        actual_nulls = series.isnull().sum()
        non_null_count = series.notna().sum()
        coverage_pct = _percent(non_null_count, total)

        print(
            f" {col}: {non_null_count:,} ({coverage_pct:.1f}%) - 'NA' strings: {na_strings:,}, actual nulls: {actual_nulls:,}"
        )
        print(f" Sample values: {series.head(5).tolist()}")


def _print_numeric_stats(gdf, columns):
    """Print descriptive statistics for each listed column that is present."""
    for col in columns:
        if col not in gdf.columns:
            continue

        print(f"\n{col} analysis:")

        # Treat "NA" placeholders as missing, then coerce anything else that
        # is not numeric to NaN so it is dropped from the statistics.
        data = gdf[col].replace("NA", pd.NA)
        numeric_data = pd.to_numeric(data, errors="coerce")
        non_null_data = numeric_data.dropna()

        if non_null_data.empty:
            print(" No valid numeric data found")
            continue

        stats = non_null_data.describe()
        print(f" Count: {len(non_null_data):,}")
        print(f" Mean: {stats['mean']:.2f}")
        print(f" Std: {stats['std']:.2f}")
        print(f" Min: {stats['min']:.2f}")
        print(f" Max: {stats['max']:.2f}")
        print(f" Q1: {stats['25%']:.2f}")
        print(f" Q3: {stats['75%']:.2f}")
        print(f" Sample values: {non_null_data.head(10).tolist()}")


def _print_value_counts(gdf, columns):
    """Print the frequency and share of each distinct value per present column."""
    total = len(gdf)
    for col in columns:
        if col not in gdf.columns:
            continue

        print(f"\n{col} analysis:")
        # Shares are relative to all rows, not only to the rows counted here.
        for value, count in gdf[col].value_counts().items():
            pct = _percent(count, total)
            print(f" {value}: {count:,} ({pct:.1f}%)")

        print(f" Sample values: {gdf[col].head(10).tolist()}")


if __name__ == "__main__":
    analyze_delinquencies()

data/src/config/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from contextlib import contextmanager
44
from pathlib import Path
55

6-
FORCE_RELOAD = True
6+
FORCE_RELOAD = False
77
""" During the data load, whether to query the various GIS API services for the data to load. If True, will query the
88
API services and report on data differences. If False, will read the cached data."""
99

data/src/data_utils/delinquencies.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,11 @@ def delinquencies(
102102
["nan", "None", ""], pd.NaT
103103
)
104104

105-
# Convert valid values to datetime
105+
# Convert valid values to datetime with explicit format
106106
merged_gdf["most_recent_year_owed"] = pd.to_datetime(
107-
merged_gdf["most_recent_year_owed"].astype(str) + "-12-31", errors="coerce"
107+
merged_gdf["most_recent_year_owed"].astype(str) + "-12-31",
108+
format="%Y-%m-%d",
109+
errors="coerce",
108110
)
109111

110112
# Fill missing values with "NA" for string columns

0 commit comments

Comments
 (0)