Skip to content

Commit 0e3647b

Browse files
committed
test: add validator, validator unit tests for contig neighbors
1 parent f303e2d commit 0e3647b

File tree

6 files changed

+775
-43
lines changed

6 files changed

+775
-43
lines changed

data/analyze_contig_neighbors.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Simple analysis script for contig_neighbors data.
4+
"""
5+
6+
from src.data_utils import opa_properties, pwd_parcels, vacant_properties
7+
from src.data_utils.contig_neighbors import contig_neighbors
8+
9+
10+
def analyze_contig_neighbors():
    """Print summary statistics for the ``n_contiguous`` column.

    Builds the dataset by calling ``opa_properties``, ``vacant_properties``
    and ``pwd_parcels`` in that order, then calls
    ``contig_neighbors.__wrapped__`` on the result and reports on the
    ``n_contiguous`` column of what comes back.

    Takes no arguments and returns ``None``; everything is written to stdout.
    """
    print("Loading base dataset...")
    parcels, _ = opa_properties()

    # The remaining preparatory stages all share the same
    # ``stage(dataset) -> (dataset, <ignored>)`` call shape.
    for stage_name, stage in (
        ("vacant_properties", vacant_properties),
        ("pwd_parcels", pwd_parcels),
    ):
        print(f"Running {stage_name}...")
        parcels, _ = stage(parcels)

    print("Running contig_neighbors (bypassing validation)...")
    # ``__wrapped__`` is the undecorated function (presumably exposed by
    # functools.wraps on the validation decorator -- confirm in the decorator).
    analyzed, _ = contig_neighbors.__wrapped__(parcels)

    rule = "=" * 60
    print("\n" + rule)
    print("CONTIG NEIGHBORS ANALYSIS")
    print(rule)

    # Guard: nothing to report without the column.
    if "n_contiguous" not in analyzed.columns:
        print("\nNo n_contiguous column found!")
        return

    present = analyzed["n_contiguous"].dropna()
    n_present = len(present)

    print(f"\nTotal records: {len(analyzed):,}")
    print(f"Non-null n_contiguous values: {n_present:,}")
    print(
        f"Null n_contiguous values: {analyzed['n_contiguous'].isna().sum():,}"
    )

    # Guard: every value is null, so there are no statistics to compute.
    if n_present <= 0:
        print("\nNo non-null n_contiguous values found!")
        return

    print("\nActual Statistics:")
    # Each statistic is computed once and reused in both report sections.
    observed_min = present.min()
    observed_max = present.max()
    observed_mean = present.mean()
    observed_std = present.std()
    observed_q1 = present.quantile(0.25)
    observed_q3 = present.quantile(0.75)

    print(f" Min: {observed_min}")
    print(f" Max: {observed_max}")
    print(f" Mean: {observed_mean:.3f}")
    print(f" Std: {observed_std:.3f}")
    print(f" Q1: {observed_q1:.3f}")
    print(f" Q3: {observed_q3:.3f}")

    print("\nExpected Ranges:")
    print(f" Max: <= 49 (actual: {observed_max})")
    print(f" Mean: [2.05, 3.08] (actual: {observed_mean:.3f})")
    print(f" Std: [3.90, 5.85] (actual: {observed_std:.3f})")
    print(f" Q1: [0.00, 0.00] (actual: {observed_q1:.3f})")
    print(f" Q3: [2.40, 3.60] (actual: {observed_q3:.3f})")

    print("\nValue Distribution:")
    distribution = present.value_counts().sort_index()
    # Only the first 20 distinct values (in ascending order) are listed.
    for value, count in distribution.head(20).items():
        pct = (count / n_present) * 100
        print(f" {value}: {count:,} ({pct:.1f}%)")

    n_hidden = len(distribution) - 20
    if n_hidden > 0:
        print(f" ... and {n_hidden} more unique values")
68+
69+
70+
# Run the analysis only when this file is executed as a script, so that
# importing the module does not trigger the data pipeline.
if __name__ == "__main__":
    analyze_contig_neighbors()

data/src/data_utils/contig_neighbors.py

Lines changed: 126 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,44 @@ def contig_neighbors(
3535
Primary Feature Layer Columns Referenced:
3636
opa_id, vacant
3737
"""
38+
print(f"[DEBUG] contig_neighbors: Starting with {len(input_gdf)} properties")
39+
print(f"[DEBUG] contig_neighbors: Vacant properties: {input_gdf['vacant'].sum()}")
40+
41+
# Debug geometry types
42+
geometry_types = input_gdf.geometry.type.value_counts()
43+
print(
44+
f"[DEBUG] contig_neighbors: Geometry types in dataset: {dict(geometry_types)}"
45+
)
46+
47+
# Debug vacant properties geometry types
48+
vacant_gdf = input_gdf[input_gdf["vacant"]]
49+
if len(vacant_gdf) > 0:
50+
vacant_geometry_types = vacant_gdf.geometry.type.value_counts()
51+
print(
52+
f"[DEBUG] contig_neighbors: Vacant properties geometry types: {dict(vacant_geometry_types)}"
53+
)
54+
else:
55+
print("[DEBUG] contig_neighbors: No vacant properties found")
56+
3857
# Create a filtered dataframe with only vacant properties and polygon geometries
3958
vacant_parcels = input_gdf.loc[
4059
(input_gdf["vacant"])
4160
& (input_gdf.geometry.type.isin(["Polygon", "MultiPolygon"])),
4261
["opa_id", "geometry"],
4362
]
4463

64+
print(
65+
f"[DEBUG] contig_neighbors: Vacant parcels with valid geometry: {len(vacant_parcels)}"
66+
)
67+
4568
if vacant_parcels.empty:
4669
print("No vacant properties found in the dataset.")
4770
input_gdf["n_contiguous"] = np.nan
48-
return input_gdf
71+
print("[DEBUG] contig_neighbors: Returning single value (should be tuple)")
72+
result = input_gdf, ValidationResult(True)
73+
print(f"[DEBUG] contig_neighbors: Return type: {type(result)}")
74+
print(f"[DEBUG] contig_neighbors: Return length: {len(result)}")
75+
return result
4976

5077
with warnings.catch_warnings():
5178
warnings.filterwarnings("ignore", category=FutureWarning)
@@ -66,13 +93,110 @@ def contig_neighbors(
6693
node: len(nx.node_connected_component(g, node)) - 1 for node in g.nodes
6794
}
6895

96+
# Debug: Check what values we're getting
97+
print(
98+
f"[DEBUG] contig_neighbors: n_contiguous values calculated: {len(n_contiguous)}"
99+
)
100+
if len(n_contiguous) > 0:
101+
sample_values = list(n_contiguous.values())[:10]
102+
print(f"[DEBUG] contig_neighbors: Sample n_contiguous values: {sample_values}")
103+
print(
104+
f"[DEBUG] contig_neighbors: n_contiguous value types: {[type(v) for v in sample_values]}"
105+
)
106+
69107
# Assign the contiguous neighbor count to the filtered vacant parcels
70108
vacant_parcels["n_contiguous"] = vacant_parcels.index.map(n_contiguous)
71109

110+
# Debug: Check what's in vacant_parcels after assignment
111+
print(
112+
f"[DEBUG] contig_neighbors: vacant_parcels n_contiguous column: {vacant_parcels['n_contiguous'].dtype}"
113+
)
114+
print(
115+
f"[DEBUG] contig_neighbors: vacant_parcels n_contiguous sample: {vacant_parcels['n_contiguous'].head().tolist()}"
116+
)
117+
print(
118+
f"[DEBUG] contig_neighbors: vacant_parcels n_contiguous null count: {vacant_parcels['n_contiguous'].isna().sum()}"
119+
)
120+
121+
# Debug: Check for boolean values in vacant_parcels
122+
bool_mask = vacant_parcels["n_contiguous"].apply(lambda x: isinstance(x, bool))
123+
if bool_mask.any():
124+
print(
125+
f"[DEBUG] contig_neighbors: Found {bool_mask.sum()} boolean values in vacant_parcels!"
126+
)
127+
print(
128+
f"[DEBUG] contig_neighbors: Boolean values: {vacant_parcels.loc[bool_mask, 'n_contiguous'].tolist()}"
129+
)
130+
131+
# Debug: Check opa_id matching
132+
print(
133+
f"[DEBUG] contig_neighbors: vacant_parcels opa_id sample: {vacant_parcels['opa_id'].head().tolist()}"
134+
)
135+
print(
136+
f"[DEBUG] contig_neighbors: input_gdf opa_id sample: {input_gdf['opa_id'].head().tolist()}"
137+
)
138+
print(
139+
f"[DEBUG] contig_neighbors: vacant_parcels opa_id type: {vacant_parcels['opa_id'].dtype}"
140+
)
141+
print(
142+
f"[DEBUG] contig_neighbors: input_gdf opa_id type: {input_gdf['opa_id'].dtype}"
143+
)
144+
72145
# Merge the results back to the primary feature layer
73146
input_gdf = opa_join(input_gdf, vacant_parcels[["opa_id", "n_contiguous"]])
74147

148+
# Debug: Check what's in input_gdf after join
149+
print(
150+
f"[DEBUG] contig_neighbors: input_gdf n_contiguous column: {input_gdf['n_contiguous'].dtype}"
151+
)
152+
print(
153+
f"[DEBUG] contig_neighbors: input_gdf n_contiguous sample: {input_gdf['n_contiguous'].head().tolist()}"
154+
)
155+
print(
156+
f"[DEBUG] contig_neighbors: input_gdf n_contiguous null count: {input_gdf['n_contiguous'].isna().sum()}"
157+
)
158+
159+
# Debug: Check for boolean values in input_gdf
160+
bool_mask = input_gdf["n_contiguous"].apply(lambda x: isinstance(x, bool))
161+
if bool_mask.any():
162+
print(
163+
f"[DEBUG] contig_neighbors: Found {bool_mask.sum()} boolean values in input_gdf!"
164+
)
165+
print(
166+
f"[DEBUG] contig_neighbors: Boolean values: {input_gdf.loc[bool_mask, 'n_contiguous'].tolist()}"
167+
)
168+
169+
# Debug: Check if any non-null values exist
170+
non_null_mask = input_gdf["n_contiguous"].notna()
171+
if non_null_mask.any():
172+
print(
173+
f"[DEBUG] contig_neighbors: Found {non_null_mask.sum()} non-null n_contiguous values"
174+
)
175+
print(
176+
f"[DEBUG] contig_neighbors: Non-null sample: {input_gdf.loc[non_null_mask, 'n_contiguous'].head().tolist()}"
177+
)
178+
else:
179+
print(
180+
"[DEBUG] contig_neighbors: No non-null n_contiguous values found after join"
181+
)
182+
75183
# Assign NA for non-vacant properties
76184
input_gdf.loc[~input_gdf["vacant"], "n_contiguous"] = np.nan
77185

78-
return input_gdf, ValidationResult(True)
186+
# Final check: Ensure no boolean values remain
187+
final_bool_mask = input_gdf["n_contiguous"].apply(lambda x: isinstance(x, bool))
188+
if final_bool_mask.any():
189+
print(
190+
f"[DEBUG] contig_neighbors: WARNING - Found {final_bool_mask.sum()} boolean values in final result!"
191+
)
192+
print("[DEBUG] contig_neighbors: Converting boolean values to numeric...")
193+
# Convert boolean False to 0, True to 1
194+
input_gdf.loc[final_bool_mask, "n_contiguous"] = input_gdf.loc[
195+
final_bool_mask, "n_contiguous"
196+
].astype(int)
197+
198+
print("[DEBUG] contig_neighbors: Returning tuple with ValidationResult")
199+
result = input_gdf, ValidationResult(True)
200+
print(f"[DEBUG] contig_neighbors: Return type: {type(result)}")
201+
print(f"[DEBUG] contig_neighbors: Return length: {len(result)}")
202+
return result

0 commit comments

Comments
 (0)