1 | 1 | import sys |
2 | | - |
3 | | -from new_etl.data_utils.access_process import access_process |
4 | | -from new_etl.data_utils.contig_neighbors import contig_neighbors |
5 | | -from new_etl.data_utils.dev_probability import dev_probability |
6 | | -from new_etl.data_utils.negligent_devs import negligent_devs |
7 | | -from new_etl.data_utils.opa_properties import opa_properties |
8 | | -from new_etl.data_utils.priority_level import priority_level |
9 | | -from new_etl.data_utils.vacant_properties import vacant_properties |
10 | | -from new_etl.data_utils.pwd_parcels import pwd_parcels |
11 | | -from new_etl.data_utils.city_owned_properties import city_owned_properties |
12 | | -from new_etl.data_utils.phs_properties import phs_properties |
13 | | -from new_etl.data_utils.li_violations import li_violations |
14 | | -from new_etl.data_utils.li_complaints import li_complaints |
15 | | -from new_etl.data_utils.rco_geoms import rco_geoms |
16 | | -from new_etl.data_utils.council_dists import council_dists |
17 | | -from new_etl.data_utils.tree_canopy import tree_canopy |
18 | | -from new_etl.data_utils.nbhoods import nbhoods |
19 | | -from new_etl.data_utils.gun_crimes import gun_crimes |
20 | | -from new_etl.data_utils.drug_crimes import drug_crimes |
21 | | -from new_etl.data_utils.delinquencies import delinquencies |
22 | | -from new_etl.data_utils.unsafe_buildings import unsafe_buildings |
23 | | -from new_etl.data_utils.imm_dang_buildings import imm_dang_buildings |
24 | | -from new_etl.data_utils.tactical_urbanism import tactical_urbanism |
25 | | -from new_etl.data_utils.conservatorship import conservatorship |
26 | | -from new_etl.data_utils.owner_type import owner_type |
27 | | -from new_etl.data_utils.community_gardens import community_gardens |
28 | | -from new_etl.data_utils.park_priority import park_priority |
29 | | -from new_etl.data_utils.ppr_properties import ppr_properties |
30 | | - |
31 | 2 | import pandas as pd |
| 3 | +import traceback |
32 | 4 | |
| 5 | +from config.psql import conn |
| 6 | +from config.config import tiles_file_id_prefix |
| 7 | + |
| 8 | +from new_etl.classes.slack_reporters import send_dataframe_profile_to_slack, send_pg_stats_to_slack, send_error_to_slack |
| 9 | +from new_etl.classes.data_diff import DiffReport |
| 10 | +from new_etl.data_utils import * |
| 11 | +from new_etl.database import to_postgis_with_schema |
33 | 12 | |
34 | 13 | # Ensure the directory containing awkde is in the Python path |
35 | 14 | awkde_path = "/usr/src/app" |
36 | 15 | if awkde_path not in sys.path: |
37 | 16 | sys.path.append(awkde_path) |
38 | 17 | |
39 | | -services = [ |
40 | | -    # vacant designation |
41 | | -    vacant_properties, # needs to run early so that other utils can make use of the `vacant` designation |
42 | | -    # geometries/areas |
43 | | -    pwd_parcels, |
44 | | -    council_dists, |
45 | | -    nbhoods, |
46 | | -    rco_geoms, |
47 | | -    # ownership |
48 | | -    city_owned_properties, |
49 | | -    phs_properties, |
50 | | -    community_gardens, |
51 | | -    ppr_properties, |
52 | | -    owner_type, |
53 | | -    # quality of life |
54 | | -    li_violations, |
55 | | -    li_complaints, |
56 | | -    tree_canopy, |
57 | | -    gun_crimes, |
58 | | -    drug_crimes, |
59 | | -    delinquencies, |
60 | | -    unsafe_buildings, |
61 | | -    imm_dang_buildings, |
62 | | -    # development |
63 | | -    contig_neighbors, |
64 | | -    dev_probability, |
65 | | -    negligent_devs, |
66 | | -    # access/interventions |
67 | | -    tactical_urbanism, |
68 | | -    conservatorship, |
69 | | -    park_priority, |
70 | | -] |
71 | | - |
72 | | -dataset = opa_properties() |
73 | | - |
74 | | -print("Initial Dataset:") |
75 | | -print("Shape:", dataset.gdf.shape) |
76 | | -print("Head:\n", dataset.gdf.head()) |
77 | | -print("NA Counts:\n", dataset.gdf.isna().sum()) |
78 | | - |
79 | | -for service in services: |
80 | | -    dataset = service(dataset) |
81 | | -    print(f"After {service.__name__}:") |
82 | | -    print("Dataset type:", type(dataset.gdf).__name__) |
83 | | -    print("Shape:", dataset.gdf.shape) |
84 | | -    print("Head:\n", dataset.gdf.head()) |
85 | | -    print("NA Counts:\n", dataset.gdf.isna().sum()) |
86 | | - |
87 | | -before_drop = dataset.gdf.shape[0] |
88 | | -dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id") |
89 | | -after_drop = dataset.gdf.shape[0] |
90 | | -print( |
91 | | -    f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}" |
92 | | -) |
93 | | - |
94 | | -# Add Priority Level |
95 | | -dataset = priority_level(dataset) |
96 | | - |
97 | | -# Print the distribution of "priority_level" |
98 | | -distribution = dataset.gdf["priority_level"].value_counts() |
99 | | -print("Distribution of priority level:") |
100 | | -print(distribution) |
101 | | - |
102 | | -# Add Access Process |
103 | | -dataset = access_process(dataset) |
104 | | - |
105 | | -# Print the distribution of "access_process" |
106 | | -distribution = dataset.gdf["access_process"].value_counts() |
107 | | -print("Distribution of access process:") |
108 | | -print(distribution) |
109 | | - |
110 | | -before_drop = dataset.gdf.shape[0] |
111 | | -dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id") |
112 | | -after_drop = dataset.gdf.shape[0] |
113 | | -print(f"Duplicate final dataset rows droppeds: {before_drop - after_drop}") |
114 | | - |
115 | | -# Convert problematic columns to numeric |
116 | | -numeric_columns = [ |
117 | | -    "market_value", |
118 | | -    "sale_price", |
119 | | -    "total_assessment", |
120 | | -    "total_due", |
121 | | -    "num_years_owed", |
122 | | -    "permit_count", |
123 | | -] |
124 | | -for col in numeric_columns: |
125 | | -    dataset.gdf[col] = pd.to_numeric(dataset.gdf[col], errors="coerce") |
126 | | - |
127 | | -dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype(str) |
128 | | - |
129 | | -print("Column data types before exporting to Parquet:") |
130 | | -print(dataset.gdf.dtypes) |
131 | | - |
132 | | -# Quick dataset profiling |
133 | | -print("\nQuick dataset profile:") |
134 | | - |
135 | | -# 1) Number of NA values per column |
136 | | -print("\nNumber of NA values per column:") |
137 | | -print(dataset.gdf.isna().sum()) |
138 | | - |
139 | | -# 2) Mean, median, and std of numeric columns |
140 | | -print("\nMean, Median, and Standard Deviation of numeric columns:") |
141 | | -numeric_columns = dataset.gdf.select_dtypes(include=["float", "int"]).columns |
142 | | - |
143 | | -for column in numeric_columns: |
144 | | -    mean = dataset.gdf[column].mean() |
145 | | -    median = dataset.gdf[column].median() |
146 | | -    std = dataset.gdf[column].std() |
147 | | -    print(f"{column}:\n Mean: {mean:.2f}\n Median: {median:.2f}\n Std: {std:.2f}") |
148 | | - |
149 | | -# 3) Number of unique values in string columns |
150 | | -print("\nNumber of unique values in string columns:") |
151 | | -string_columns = dataset.gdf.select_dtypes(include=["object", "string"]).columns |
152 | | -unique_values = dataset.gdf[string_columns].nunique() |
153 | | -print(unique_values) |
154 | | - |
155 | | -dataset.gdf.to_parquet("tmp/test_output.parquet") |
| 18 | + |
| 19 | +try: |
| 20 | + |
| 21 | +    print("Starting ETL process.") |
| 22 | + |
| 23 | +    services = [ |
| 24 | +        vacant_properties, # Run early for other utils to use the `vacant` designation |
| 25 | +        pwd_parcels, |
| 26 | +        council_dists, |
| 27 | +        nbhoods, |
| 28 | +        rco_geoms, |
| 29 | +        city_owned_properties, |
| 30 | +        phs_properties, |
| 31 | +        community_gardens, |
| 32 | +        ppr_properties, |
| 33 | +        owner_type, |
| 34 | +        li_violations, |
| 35 | +        li_complaints, |
| 36 | +        tree_canopy, |
| 37 | +        gun_crimes, |
| 38 | +        drug_crimes, |
| 39 | +        delinquencies, |
| 40 | +        unsafe_buildings, |
| 41 | +        imm_dang_buildings, |
| 42 | +        contig_neighbors, |
| 43 | +        dev_probability, |
| 44 | +        negligent_devs, |
| 45 | +        tactical_urbanism, |
| 46 | +        conservatorship, |
| 47 | +        park_priority, |
| 48 | +    ] |
| 49 | + |
| 50 | +    print("Loading OPA properties dataset.") |
| 51 | +    dataset = opa_properties() |
| 52 | + |
| 53 | +    for service in services: |
| 54 | +        print(f"Running service: {service.__name__}") |
| 55 | +        dataset = service(dataset) |
| 56 | + |
| 57 | +    print("Applying final dataset transformations.") |
| 58 | +    dataset = priority_level(dataset) |
| 59 | +    dataset = access_process(dataset) |
| 60 | + |
| 61 | +    # Drop duplicates |
| 62 | +    before_drop = dataset.gdf.shape[0] |
| 63 | +    dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id") |
| 64 | +    print(f"Duplicate rows dropped: {before_drop - dataset.gdf.shape[0]}") |
| 65 | + |
| 66 | +    # Convert columns to numeric where necessary |
| 67 | +    numeric_columns = [ |
| 68 | +        "market_value", |
| 69 | +        "sale_price", |
| 70 | +        "total_assessment", |
| 71 | +        "total_due", |
| 72 | +        "num_years_owed", |
| 73 | +        "permit_count", |
| 74 | +    ] |
| 75 | +    dataset.gdf[numeric_columns] = dataset.gdf[numeric_columns].apply(pd.to_numeric, errors="coerce") |
| 76 | +    dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype(str) |
| 77 | + |
| 78 | +    # Dataset profiling |
| 79 | +    send_dataframe_profile_to_slack(dataset.gdf, "all_properties_end") |
| 80 | + |
| 81 | +    # Save dataset to PostgreSQL |
| 82 | +    to_postgis_with_schema(dataset.gdf, "all_properties_end", conn) |
| 83 | + |
| 84 | +    # Generate and send diff report |
| 85 | +    diff_report = DiffReport() |
| 86 | +    diff_report.run() |
| 87 | + |
| 88 | +    send_pg_stats_to_slack(conn) # Send PostgreSQL stats to Slack |
| 89 | + |
| 90 | +    # Save local Parquet file |
| 91 | +    parquet_path = "tmp/test_output.parquet" |
| 92 | +    dataset.gdf.to_parquet(parquet_path) |
| 93 | +    print(f"Dataset saved to Parquet: {parquet_path}") |
| 94 | + |
| 95 | +    # Publish only vacant properties |
| 96 | +    dataset.gdf = dataset.gdf[dataset.gdf["vacant"]] |
| 97 | +    dataset.build_and_publish(tiles_file_id_prefix) |
| 98 | + |
| 99 | +    # Finalize |
| 100 | +    conn.commit() |
| 101 | +    conn.close() |
| 102 | +    print("ETL process completed successfully.") |
| 103 | + |
| 104 | +except Exception as e: |
| 105 | +    error_message = f"Error in backend job: {str(e)}\n\n{traceback.format_exc()}" |
| 106 | +    send_error_to_slack(error_message) |
| 107 | +    raise # Re-raise so the job still exits non-zero after reporting |
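
A note on the loop at the heart of the new script: each entry in `services` is a callable that accepts the dataset object and returns it with additional columns attached, which is what lets the pipeline be expressed as a plain `for` loop. Below is a minimal sketch of that contract; the `Dataset` wrapper and `example_service` names are illustrative assumptions, not the repo's actual classes.

```python
from dataclasses import dataclass

import geopandas as gpd


@dataclass
class Dataset:
    """Illustrative stand-in for the object returned by opa_properties()."""

    gdf: gpd.GeoDataFrame


def example_service(dataset: Dataset) -> Dataset:
    # Each service enriches the GeoDataFrame and hands the dataset back,
    # so services compose by simple chaining:
    #   for service in services:
    #       dataset = service(dataset)
    dataset.gdf["example_flag"] = True
    return dataset
```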
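The `try`/`except` wrapper is the main operational change in this diff: any failure in the pipeline is reported to Slack, with a full traceback, before the job exits. The real reporter lives in `new_etl.classes.slack_reporters`; purely as a sketch of the pattern, assuming a standard Slack incoming webhook configured via a `SLACK_WEBHOOK_URL` environment variable (an assumption, not something this diff specifies), it could look like:

```python
import json
import os
import urllib.request


def send_error_to_slack(message: str) -> None:
    # Hypothetical sketch; the actual implementation may differ.
    webhook_url = os.environ.get("SLACK_WEBHOOK_URL")
    if not webhook_url:
        print(message)  # Fall back to stdout when no webhook is configured
        return
    payload = json.dumps({"text": message}).encode("utf-8")
    request = urllib.request.Request(
        webhook_url,
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    urllib.request.urlopen(request)
```

Because the exception is re-raised after reporting, the process still exits with a failure status, so whatever scheduler runs the job (cron, GitHub Actions, etc.) can register the failure as well.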