Commit 56fc5fb

Merge pull request #1037 from CodeForPhilly/lebovits/issu1015-cleanup-new-pipeline
Lebovits/issu1015 cleanup new pipeline
2 parents 3cefa5d + 6e82443 commit 56fc5fb

31 files changed: +2249, -1881 lines

data/src/Pipfile

Lines changed: 6 additions & 6 deletions
@@ -13,29 +13,29 @@ matplotlib = "*"
 rasterio = "*"
 scikit-learn = "*"
 mapclassify = "*"
-black = "*"
 fiona = "*"
 esridump = "*"
 sqlalchemy = "*"
 psycopg2-binary = "*"
-geoalchemy2 = "*"
 mapbox = "*"
 google-cloud-storage = "*"
 pydantic = "==2.8.2"
-data-diff = {extras = ["postgresql"], version = "*"}
 future = "*"
 slack-sdk = "*"
-pytest = "*"
 networkx = "*"
 libpysal = "*"
 jenkspy = "*"
 pyarrow = "*"
 tqdm = "*"
+geoalchemy2 = "*"
+
+[dev-packages]
+black = "*"
+pytest = "*"
 vulture = "*"
 pylint = "*"
 radon = "*"
-
-[dev-packages]
+ruff = "*"
 
 [requires]
 python_version = "3.11"
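
A brief usage note on the dependency split (this describes standard Pipenv behavior, not project-specific documentation): with black, pytest, vulture, pylint, radon, and the newly added ruff grouped under [dev-packages], `pipenv install` resolves only the runtime dependencies the ETL needs, while `pipenv install --dev` also pulls in the linting and test tooling. The removal of data-diff from [packages] suggests the diff reporting now goes through the project's own new_etl.classes.data_diff module instead.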

data/src/Pipfile.lock

Lines changed: 1157 additions & 1442 deletions
Some generated files are not rendered by default.

data/src/main.py

Lines changed: 98 additions & 146 deletions
@@ -1,155 +1,107 @@
 import sys
-
-from new_etl.data_utils.access_process import access_process
-from new_etl.data_utils.contig_neighbors import contig_neighbors
-from new_etl.data_utils.dev_probability import dev_probability
-from new_etl.data_utils.negligent_devs import negligent_devs
-from new_etl.data_utils.opa_properties import opa_properties
-from new_etl.data_utils.priority_level import priority_level
-from new_etl.data_utils.vacant_properties import vacant_properties
-from new_etl.data_utils.pwd_parcels import pwd_parcels
-from new_etl.data_utils.city_owned_properties import city_owned_properties
-from new_etl.data_utils.phs_properties import phs_properties
-from new_etl.data_utils.li_violations import li_violations
-from new_etl.data_utils.li_complaints import li_complaints
-from new_etl.data_utils.rco_geoms import rco_geoms
-from new_etl.data_utils.council_dists import council_dists
-from new_etl.data_utils.tree_canopy import tree_canopy
-from new_etl.data_utils.nbhoods import nbhoods
-from new_etl.data_utils.gun_crimes import gun_crimes
-from new_etl.data_utils.drug_crimes import drug_crimes
-from new_etl.data_utils.delinquencies import delinquencies
-from new_etl.data_utils.unsafe_buildings import unsafe_buildings
-from new_etl.data_utils.imm_dang_buildings import imm_dang_buildings
-from new_etl.data_utils.tactical_urbanism import tactical_urbanism
-from new_etl.data_utils.conservatorship import conservatorship
-from new_etl.data_utils.owner_type import owner_type
-from new_etl.data_utils.community_gardens import community_gardens
-from new_etl.data_utils.park_priority import park_priority
-from new_etl.data_utils.ppr_properties import ppr_properties
-
 import pandas as pd
+import traceback
 
+from config.psql import conn
+from config.config import tiles_file_id_prefix
+
+from new_etl.classes.slack_reporters import send_dataframe_profile_to_slack, send_pg_stats_to_slack, send_error_to_slack
+from new_etl.classes.data_diff import DiffReport
+from new_etl.data_utils import *
+from new_etl.database import to_postgis_with_schema
 
 # Ensure the directory containing awkde is in the Python path
 awkde_path = "/usr/src/app"
 if awkde_path not in sys.path:
     sys.path.append(awkde_path)
 
-services = [
-    # vacant designation
-    vacant_properties,  # needs to run early so that other utils can make use of the `vacant` designation
-    # geometries/areas
-    pwd_parcels,
-    council_dists,
-    nbhoods,
-    rco_geoms,
-    # ownership
-    city_owned_properties,
-    phs_properties,
-    community_gardens,
-    ppr_properties,
-    owner_type,
-    # quality of life
-    li_violations,
-    li_complaints,
-    tree_canopy,
-    gun_crimes,
-    drug_crimes,
-    delinquencies,
-    unsafe_buildings,
-    imm_dang_buildings,
-    # development
-    contig_neighbors,
-    dev_probability,
-    negligent_devs,
-    # access/interventions
-    tactical_urbanism,
-    conservatorship,
-    park_priority,
-]
-
-dataset = opa_properties()
-
-print("Initial Dataset:")
-print("Shape:", dataset.gdf.shape)
-print("Head:\n", dataset.gdf.head())
-print("NA Counts:\n", dataset.gdf.isna().sum())
-
-for service in services:
-    dataset = service(dataset)
-    print(f"After {service.__name__}:")
-    print("Dataset type:", type(dataset.gdf).__name__)
-    print("Shape:", dataset.gdf.shape)
-    print("Head:\n", dataset.gdf.head())
-    print("NA Counts:\n", dataset.gdf.isna().sum())
-
-before_drop = dataset.gdf.shape[0]
-dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
-after_drop = dataset.gdf.shape[0]
-print(
-    f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}"
-)
-
-# Add Priority Level
-dataset = priority_level(dataset)
-
-# Print the distribution of "priority_level"
-distribution = dataset.gdf["priority_level"].value_counts()
-print("Distribution of priority level:")
-print(distribution)
-
-# Add Access Process
-dataset = access_process(dataset)
-
-# Print the distribution of "access_process"
-distribution = dataset.gdf["access_process"].value_counts()
-print("Distribution of access process:")
-print(distribution)
-
-before_drop = dataset.gdf.shape[0]
-dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
-after_drop = dataset.gdf.shape[0]
-print(f"Duplicate final dataset rows droppeds: {before_drop - after_drop}")
-
-# Convert problematic columns to numeric
-numeric_columns = [
-    "market_value",
-    "sale_price",
-    "total_assessment",
-    "total_due",
-    "num_years_owed",
-    "permit_count",
-]
-for col in numeric_columns:
-    dataset.gdf[col] = pd.to_numeric(dataset.gdf[col], errors="coerce")
-
-dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype(str)
-
-print("Column data types before exporting to Parquet:")
-print(dataset.gdf.dtypes)
-
-# Quick dataset profiling
-print("\nQuick dataset profile:")
-
-# 1) Number of NA values per column
-print("\nNumber of NA values per column:")
-print(dataset.gdf.isna().sum())
-
-# 2) Mean, median, and std of numeric columns
-print("\nMean, Median, and Standard Deviation of numeric columns:")
-numeric_columns = dataset.gdf.select_dtypes(include=["float", "int"]).columns
-
-for column in numeric_columns:
-    mean = dataset.gdf[column].mean()
-    median = dataset.gdf[column].median()
-    std = dataset.gdf[column].std()
-    print(f"{column}:\n Mean: {mean:.2f}\n Median: {median:.2f}\n Std: {std:.2f}")
-
-# 3) Number of unique values in string columns
-print("\nNumber of unique values in string columns:")
-string_columns = dataset.gdf.select_dtypes(include=["object", "string"]).columns
-unique_values = dataset.gdf[string_columns].nunique()
-print(unique_values)
-
-dataset.gdf.to_parquet("tmp/test_output.parquet")
+
+try:
+
+    print("Starting ETL process.")
+
+    services = [
+        vacant_properties,  # Run early for other utils to use the `vacant` designation
+        pwd_parcels,
+        council_dists,
+        nbhoods,
+        rco_geoms,
+        city_owned_properties,
+        phs_properties,
+        community_gardens,
+        ppr_properties,
+        owner_type,
+        li_violations,
+        li_complaints,
+        tree_canopy,
+        gun_crimes,
+        drug_crimes,
+        delinquencies,
+        unsafe_buildings,
+        imm_dang_buildings,
+        contig_neighbors,
+        dev_probability,
+        negligent_devs,
+        tactical_urbanism,
+        conservatorship,
+        park_priority,
+    ]
+
+    print("Loading OPA properties dataset.")
+    dataset = opa_properties()
+
+    for service in services:
+        print(f"Running service: {service.__name__}")
+        dataset = service(dataset)
+
+    print("Applying final dataset transformations.")
+    dataset = priority_level(dataset)
+    dataset = access_process(dataset)
+
+    # Drop duplicates
+    before_drop = dataset.gdf.shape[0]
+    dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
+    print(f"Duplicate rows dropped: {before_drop - dataset.gdf.shape[0]}")
+
+    # Convert columns to numeric where necessary
+    numeric_columns = [
+        "market_value",
+        "sale_price",
+        "total_assessment",
+        "total_due",
+        "num_years_owed",
+        "permit_count",
+    ]
+    dataset.gdf[numeric_columns] = dataset.gdf[numeric_columns].apply(pd.to_numeric, errors="coerce")
+    dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype(str)
+
+    # Dataset profiling
+    send_dataframe_profile_to_slack(dataset.gdf, "all_properties_end")
+
+    # Save dataset to PostgreSQL
+    to_postgis_with_schema(dataset.gdf, "all_properties_end", conn)
+
+    # Generate and send diff report
+    diff_report = DiffReport()
+    diff_report.run()
+
+    send_pg_stats_to_slack(conn)  # Send PostgreSQL stats to Slack
+
+    # Save local Parquet file
+    parquet_path = "tmp/test_output.parquet"
+    dataset.gdf.to_parquet(parquet_path)
+    print(f"Dataset saved to Parquet: {parquet_path}")
+
+    # Publish only vacant properties
+    dataset.gdf = dataset.gdf[dataset.gdf["vacant"]]
+    dataset.build_and_publish(tiles_file_id_prefix)
+
+    # Finalize
+    conn.commit()
+    conn.close()
+    print("ETL process completed successfully.")
+
+except Exception as e:
+    error_message = f"Error in backend job: {str(e)}\n\n{traceback.format_exc()}"
+    send_error_to_slack(error_message)
+    raise  # Optionally re-raise the exception
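
A note on the pattern the refactor leans on: the star import (from new_etl.data_utils import *) works because every entry in new_etl.data_utils exposes a callable with the same shape, which is what lets main.py loop over the services list. The snippet below is a minimal, hypothetical sketch of that contract, assuming only that the dataset wrapper exposes a .gdf GeoDataFrame as the code above does; Dataset and example_service here are illustrative stand-ins, not the project's actual classes or services.

import geopandas as gpd


class Dataset:
    # Hypothetical stand-in for the pipeline's dataset wrapper; the real object
    # returned by opa_properties() also carries helpers such as build_and_publish().
    def __init__(self, gdf: gpd.GeoDataFrame):
        self.gdf = gdf


def example_service(dataset: Dataset) -> Dataset:
    # Sketch of the contract main.py relies on: take the dataset, add or update
    # columns on its GeoDataFrame, and return it so the next service can run.
    dataset.gdf["has_geometry"] = dataset.gdf.geometry.notna()
    return dataset


if __name__ == "__main__":
    # The same chaining main.py applies over its full `services` list.
    gdf = gpd.GeoDataFrame(
        {"opa_id": ["000000001"]},
        geometry=gpd.points_from_xy([0.0], [0.0]),
    )
    dataset = Dataset(gdf)
    for service in [example_service]:
        dataset = service(dataset)
    print(dataset.gdf)

Keeping every service to this take-a-dataset, return-a-dataset shape is also what lets the new try/except wrapper report a failure in any single step to Slack without per-service error handling.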
