
Commit d98f7dc

Fix backend formatting checks (#1175)
* fix: add --check flag for formatting CI check
* fix: format files with ruff
1 parent ba5a371 commit d98f7dc

File tree

16 files changed, +146 -86 lines changed


.github/workflows/pr_checks_backend.yml

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ jobs:
       - name: Run Ruff Formatter in Docker
         run: |
           cd data
-          docker compose run --rm formatter
+          docker compose run --rm formatter sh -c "pip install ruff && ruff format --check --exclude '/usr/src/app/awkde/'"
 
   run-linter:
     runs-on: ubuntu-latest
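For reference, the same formatting gate can be run before pushing. The sketch below is hypothetical helper code, not part of this commit: it assumes ruff is installed locally and that the awkde checkout lives at data/awkde/, mirroring the /usr/src/app/awkde/ path excluded inside the formatter container.

# local_format_check.py -- hypothetical helper, not part of this commit.
# Runs the same "ruff format --check" gate the workflow step above runs in Docker.
import subprocess
import sys

result = subprocess.run(
    ["ruff", "format", "--check", "--exclude", "awkde/", "."],
    cwd="data",  # assumed repo layout: backend code lives under data/
)
sys.exit(result.returncode)  # non-zero exit means files still need formatting

Run from the repository root, it exits non-zero whenever ruff would reformat a file, which is exactly what makes the CI step fail.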

data/src/classes/backup_archive_database.py

Lines changed: 19 additions & 11 deletions
@@ -4,7 +4,12 @@
 from datetime import datetime, timedelta
 
 import sqlalchemy as sa
-from config.config import log_level, max_backup_schema_days, tiles_file_id_prefix, tile_file_backup_directory
+from config.config import (
+    log_level,
+    max_backup_schema_days,
+    tiles_file_id_prefix,
+    tile_file_backup_directory,
+)
 from config.psql import conn, local_engine, url
 from data_utils.utils import mask_password
 from sqlalchemy import inspect
@@ -39,13 +44,15 @@ def backup_schema(self):
             "pg_dump "
             + url
             + " -s --schema public | "
-            + " sed 's/public/" + backup_schema_name + "/g'"
+            + " sed 's/public/"
+            + backup_schema_name
+            + "/g'"
             + " | sed 's/"
             + backup_schema_name
             + ".geometry/public.geometry/' | sed 's/"
             + backup_schema_name
             + ".spatial_ref_sys/public.spatial_ref_sys/'"
-            + " | sed 's/backup__/public_/g'" # ppr_properties.public_name column needs to be restored.
+            + " | sed 's/backup__/public_/g'"  # ppr_properties.public_name column needs to be restored.
             + " | psql -v ON_ERROR_STOP=1 "
             + url
             + " > /dev/null "
@@ -109,24 +116,25 @@ def prune_old_archives(self):
         conn.execute(sa.DDL(sql))
 
     def is_backup_schema_exists(self) -> bool:
-        """ whether the backup schema exists
+        """whether the backup schema exists
 
         Returns:
             bool: whether true
-        """
+        """
         return backup_schema_name in inspect(local_engine).get_schema_names()
-
+
     def backup_tiles_file(self):
-        """backup the main tiles file to a timestamped copy in the backup/ folder in GCP
-        """
+        """backup the main tiles file to a timestamped copy in the backup/ folder in GCP"""
         bucket = google_cloud_bucket()
         count: int = 0
         for blob in bucket.list_blobs(prefix=tiles_file_id_prefix):
-            suffix: str = '_' + self.timestamp_string
+            suffix: str = "_" + self.timestamp_string
             name, ext = os.path.splitext(blob.name)
-            backup_file_name: str = tile_file_backup_directory + "/" + name + suffix + ext
+            backup_file_name: str = (
+                tile_file_backup_directory + "/" + name + suffix + ext
+            )
             log.debug(backup_file_name)
-            bucket.copy_blob(blob,destination_bucket=bucket,new_name=backup_file_name)
+            bucket.copy_blob(blob, destination_bucket=bucket, new_name=backup_file_name)
             count += 1
         if count == 0:
             log.warning("No files were found to back up.")
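To make the reformatted backup_tiles_file logic easier to read, here is a toy illustration of the backup name it assembles for one blob; the directory, timestamp format, and blob name below are made-up example values, not actual configuration.

import os

tile_file_backup_directory = "backup"      # hypothetical config value
timestamp_string = "2024_01_01_00_00_00"   # hypothetical timestamp format
blob_name = "vacant_properties.pmtiles"    # hypothetical tiles file name

name, ext = os.path.splitext(blob_name)
backup_file_name = tile_file_backup_directory + "/" + name + "_" + timestamp_string + ext
print(backup_file_name)  # backup/vacant_properties_2024_01_01_00_00_00.pmtiles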

data/src/classes/diff_report.py

Lines changed: 31 additions & 11 deletions
@@ -20,9 +20,10 @@
 
 log.basicConfig(level=log_level)
 
+
 class DiffTable:
-    """Metadata about a table to be run through data-diff
-    """
+    """Metadata about a table to be run through data-diff"""
+
     def __init__(self, table: str, pk_cols: list[str], where: str = None):
         """constructor
 
@@ -35,6 +36,7 @@ def __init__(self, table: str, pk_cols: list[str], where: str = None):
         self.pk_cols = pk_cols
         self.where = where
 
+
 class DiffReport:
     """
     Class to manage computing data differences for all tables between the newly imported schema and the last schema. Build a report of summary differences for all tables. Log detailed differences to a table in the old backed-up schema. Post difference summary to Slack and or email.
@@ -48,15 +50,23 @@ def __init__(self, timestamp_string: str = None):
         """
         self.diff_tables = self._list_diff_tables()
         self.timestamp_string = timestamp_string
-        self.report: str = "The back-end data has been fully refreshed. Here is the difference report on " + str(len(self.diff_tables)) + " key tables.\nLegend: table A = new data, table B = old data.\n\n"
+        self.report: str = (
+            "The back-end data has been fully refreshed. Here is the difference report on "
+            + str(len(self.diff_tables))
+            + " key tables.\nLegend: table A = new data, table B = old data.\n\n"
+        )
 
     def run(self):
         """
         run the report and slack or email it.
         """
 
         for diff_table in self.diff_tables:
-            log.debug("Process table %s with pks %s", diff_table.table, str(diff_table.pk_cols))
+            log.debug(
+                "Process table %s with pks %s",
+                diff_table.table,
+                str(diff_table.pk_cols),
+            )
             summary = diff_table.table + "\n" + self.compare_table(diff_table)
             # if no differences, do not report.
             if self._summary_shows_differences(summary):
@@ -141,11 +151,23 @@ def _list_diff_tables(self) -> list[DiffTable]:
             list[DiffTable]: the list of metadata
         """
         return [
-            DiffTable(table="vacant_properties",pk_cols=["opa_id", "parcel_type"],where="opa_id is not null"),
-            DiffTable(table="li_complaints",pk_cols=["service_request_id"]),
-            DiffTable(table="li_violations",pk_cols=["violationnumber", "opa_account_num"],where="opa_account_num is not null"),
-            DiffTable(table="opa_properties",pk_cols=["parcel_number"]),
-            DiffTable(table="property_tax_delinquencies",pk_cols=["opa_number"],where="opa_number <> 0")
+            DiffTable(
+                table="vacant_properties",
+                pk_cols=["opa_id", "parcel_type"],
+                where="opa_id is not null",
+            ),
+            DiffTable(table="li_complaints", pk_cols=["service_request_id"]),
+            DiffTable(
+                table="li_violations",
+                pk_cols=["violationnumber", "opa_account_num"],
+                where="opa_account_num is not null",
+            ),
+            DiffTable(table="opa_properties", pk_cols=["parcel_number"]),
+            DiffTable(
+                table="property_tax_delinquencies",
+                pk_cols=["opa_number"],
+                where="opa_number <> 0",
+            ),
         ]
 
     def compare_table(self, diff_table: DiffTable) -> str:
@@ -221,5 +243,3 @@ def email_report(self):
         s = smtplib.SMTP(smtp_server)
         s.sendmail(from_email, [report_to_email], msg.as_string())
         s.quit()
-
-
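A hedged usage sketch of the two classes above; the import path is inferred from the data/src/classes/ layout, and the driver that actually invokes DiffReport is not shown in this diff.

from classes.diff_report import DiffReport  # assumed import path (data/src/classes/)

# Builds the summary header and per-table metadata, then diffs each table and
# posts or emails the report; requires database and Slack/email config to be set up.
report = DiffReport(timestamp_string="2024_01_01_00_00_00")  # hypothetical timestamp
report.run()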

data/src/classes/featurelayer.py

Lines changed: 2 additions & 4 deletions
@@ -42,8 +42,6 @@ def google_cloud_bucket() -> Bucket:
     return storage_client.bucket(bucket_name)
 
 
-
-
 class FeatureLayer:
     """
     FeatureLayer is a class to represent a GIS dataset. It can be initialized with a URL to an Esri Feature Service, a SQL query to Carto, or a GeoDataFrame.
@@ -60,7 +58,7 @@ def __init__(
         from_xy=False,
         use_wkb_geom_field=None,
         cols: list[str] = None,
-        bucket: Bucket = None
+        bucket: Bucket = None,
     ):
         self.name = name
         self.esri_rest_urls = (
@@ -406,4 +404,4 @@ def build_and_publish(self, tiles_file_id_prefix: str) -> None:
                 blob.upload_from_filename(temp_merged_pmtiles)
                 print(f"PMTiles upload successful for {file}!")
             except Exception as e:
-                print(f"PMTiles upload failed for {file}: {e}")
+                print(f"PMTiles upload failed for {file}: {e}")

data/src/constants/services.py

Lines changed: 1 addition & 1 deletion
@@ -75,4 +75,4 @@
 
 CENSUS_BGS_URL = (
     "https://opendata.arcgis.com/datasets/2f982bada233478ea0100528227febce_0.geojson"
-)
+)

data/src/data_utils/access_process.py

Lines changed: 1 addition & 1 deletion
@@ -39,5 +39,5 @@ def access_process(dataset: Any) -> Any:
         access_processes.append(access_process)
 
     dataset.gdf["access_process"] = access_processes
-
+
     return dataset

data/src/data_utils/community_gardens.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ def community_gardens(primary_featurelayer):
     )
 
     community_gardens.gdf = community_gardens.gdf[["Site_Name", "geometry"]]
-
+
     primary_featurelayer.spatial_join(community_gardens)
 
     # Create a boolean mask where 'site_Name' is not null

data/src/data_utils/l_and_i.py

Lines changed: 20 additions & 11 deletions
@@ -4,6 +4,7 @@
 from classes.featurelayer import FeatureLayer
 from constants.services import COMPLAINTS_SQL_QUERY, VIOLATIONS_SQL_QUERY
 
+
 def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
     """
     Process L&I (Licenses and Inspections) data for complaints and violations.
@@ -19,20 +20,27 @@ def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
         FeatureLayer: The primary feature layer updated with L&I data.
     """
     keywords: List[str] = [
-        'dumping', 'blight', 'rubbish', 'weeds', 'graffiti',
-        'abandoned', 'sanitation', 'litter', 'vacant', 'trash',
-        'unsafe'
+        "dumping",
+        "blight",
+        "rubbish",
+        "weeds",
+        "graffiti",
+        "abandoned",
+        "sanitation",
+        "litter",
+        "vacant",
+        "trash",
+        "unsafe",
     ]
 
     # Load complaints data from L&I
     l_and_i_complaints: FeatureLayer = FeatureLayer(
-        name="LI Complaints",
-        carto_sql_queries=COMPLAINTS_SQL_QUERY
+        name="LI Complaints", carto_sql_queries=COMPLAINTS_SQL_QUERY
     )
 
     # Filter for rows where 'subject' contains any of the keywords
     l_and_i_complaints.gdf = l_and_i_complaints.gdf[
-        l_and_i_complaints.gdf["subject"].str.lower().str.contains('|'.join(keywords))
+        l_and_i_complaints.gdf["subject"].str.lower().str.contains("|".join(keywords))
     ]
 
     # Filter for only Status = 'Open'
@@ -56,14 +64,15 @@ def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
 
     # Load data for violations from L&I
     l_and_i_violations: FeatureLayer = FeatureLayer(
-        name="LI Violations",
-        carto_sql_queries=VIOLATIONS_SQL_QUERY,
-        from_xy=True
+        name="LI Violations", carto_sql_queries=VIOLATIONS_SQL_QUERY, from_xy=True
     )
 
     # Filter for rows where 'casetype' contains any of the keywords, handling NaN values
     l_and_i_violations.gdf = l_and_i_violations.gdf[
-        l_and_i_violations.gdf["violationcodetitle"].fillna('').str.lower().str.contains('|'.join(keywords))
+        l_and_i_violations.gdf["violationcodetitle"]
+        .fillna("")
+        .str.lower()
+        .str.contains("|".join(keywords))
     ]
 
     all_violations_count_df: pd.DataFrame = (
@@ -175,4 +184,4 @@ def remove_nan_strings(x: str) -> str | None:
         .astype(int)
     )
 
-    return primary_featurelayer
+    return primary_featurelayer
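As a standalone illustration of the keyword-filter pattern used for both complaints and violations above, here is a toy pandas example on made-up data (not project code):

import pandas as pd

# Keep only rows whose text mentions any keyword, treating missing values as empty strings.
keywords = ["dumping", "blight", "trash"]
df = pd.DataFrame({"subject": ["Illegal Dumping Complaint", "Noise Complaint", None]})
mask = df["subject"].fillna("").str.lower().str.contains("|".join(keywords))
print(df[mask])  # keeps only the row whose subject mentions a keyword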

data/src/data_utils/nbhoods.py

Lines changed: 7 additions & 7 deletions
@@ -7,19 +7,19 @@
 
 def nbhoods(primary_featurelayer):
     phl_nbhoods = gpd.read_file(NBHOODS_URL)
-
+
     # Correct the column name to uppercase if needed
-    if 'MAPNAME' in phl_nbhoods.columns:
+    if "MAPNAME" in phl_nbhoods.columns:
         phl_nbhoods.rename(columns={"MAPNAME": "neighborhood"}, inplace=True)
-
+
     phl_nbhoods = phl_nbhoods.to_crs(USE_CRS)
-
+
     nbhoods = FeatureLayer("Neighborhoods")
     nbhoods.gdf = phl_nbhoods
-
+
     red_cols_to_keep = ["neighborhood", "geometry"]
     nbhoods.gdf = nbhoods.gdf[red_cols_to_keep]
-
+
     primary_featurelayer.spatial_join(nbhoods)
-
+
     return primary_featurelayer

data/src/data_utils/negligent_devs.py

Lines changed: 29 additions & 12 deletions
@@ -62,23 +62,29 @@ def negligent_devs(primary_featurelayer):
     print("Columns in 'devs' DataFrame:", devs.columns)
 
     print("Initial properties data:")
-    print(devs[['opa_id', 'city_owner_agency', 'mailing_street']].head(10))
+    print(devs[["opa_id", "city_owner_agency", "mailing_street"]].head(10))
 
-    city_owners = devs.loc[~devs["city_owner_agency"].isna() & (devs["city_owner_agency"] != "")].copy()
-    non_city_owners = devs.loc[devs["city_owner_agency"].isna() | (devs["city_owner_agency"] == "")].copy()
+    city_owners = devs.loc[
+        ~devs["city_owner_agency"].isna() & (devs["city_owner_agency"] != "")
+    ].copy()
+    non_city_owners = devs.loc[
+        devs["city_owner_agency"].isna() | (devs["city_owner_agency"] == "")
+    ].copy()
 
-    print(f"City owners shape: {city_owners.shape}, Non-city owners shape: {non_city_owners.shape}")
+    print(
+        f"City owners shape: {city_owners.shape}, Non-city owners shape: {non_city_owners.shape}"
+    )
 
     # Log before standardizing addresses
     print("Non-city owners mailing streets before standardization:")
-    print(non_city_owners[['opa_id', 'mailing_street']].head(10))
+    print(non_city_owners[["opa_id", "mailing_street"]].head(10))
 
     non_city_owners.loc[:, "mailing_street"] = (
         non_city_owners["mailing_street"].astype(str).apply(standardize_street)
     )
 
     print("Non-city owners mailing streets after standardization:")
-    print(non_city_owners[['opa_id', 'mailing_street']].head(10))
+    print(non_city_owners[["opa_id", "mailing_street"]].head(10))
 
     for term in ["ST", "AVE", "RD", "BLVD"]:
         non_city_owners.loc[:, "mailing_street"] = non_city_owners[
@@ -87,7 +93,7 @@ def negligent_devs(primary_featurelayer):
 
     # Log after applying term replacement
     print("Non-city owners mailing streets after term replacement:")
-    print(non_city_owners[['opa_id', 'mailing_street']].head(10))
+    print(non_city_owners[["opa_id", "mailing_street"]].head(10))
 
     # Fill missing address components
     non_city_owners.loc[:, "mailing_address_1"] = non_city_owners[
@@ -106,7 +112,11 @@ def negligent_devs(primary_featurelayer):
 
     # Log addresses before creating standardized address
    print("Non-city owners mailing details before creating standardized address:")
-    print(non_city_owners[['opa_id', 'mailing_street', 'mailing_city_state', 'mailing_zip']].head(10))
+    print(
+        non_city_owners[
+            ["opa_id", "mailing_street", "mailing_city_state", "mailing_zip"]
+        ].head(10)
+    )
 
     non_city_owners.loc[:, "standardized_address"] = non_city_owners.apply(
         create_standardized_address, axis=1
@@ -145,10 +155,10 @@ def negligent_devs(primary_featurelayer):
     )
 
     devs_combined = pd.concat([city_owners, non_city_owners], axis=0)
-
+
     # Final check on the merged data before updating primary_featurelayer
     print("Combined data with property counts:")
-    print(devs_combined[['opa_id', 'property_count']].head(10))
+    print(devs_combined[["opa_id", "property_count"]].head(10))
 
     primary_featurelayer.gdf = primary_featurelayer.gdf.merge(
         devs_combined[["opa_id", "property_count"]], on="opa_id", how="left"
@@ -158,9 +168,16 @@ def negligent_devs(primary_featurelayer):
     )
     primary_featurelayer.gdf.loc[:, "negligent_dev"] = (
         primary_featurelayer.gdf["n_properties_owned"] > 5
-    ) & (primary_featurelayer.gdf["city_owner_agency"].isna() | (primary_featurelayer.gdf["city_owner_agency"] == ""))
+    ) & (
+        primary_featurelayer.gdf["city_owner_agency"].isna()
+        | (primary_featurelayer.gdf["city_owner_agency"] == "")
+    )
 
     print("Final feature layer data with negligent_dev flag:")
-    print(primary_featurelayer.gdf[['opa_id', 'n_properties_owned', 'negligent_dev']].head(10))
+    print(
+        primary_featurelayer.gdf[
+            ["opa_id", "n_properties_owned", "negligent_dev"]
+        ].head(10)
+    )
 
     return primary_featurelayer
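The negligent_dev flag reformatted above combines two conditions: more than five properties owned and no city owner agency. A toy pandas illustration on made-up values, not project data:

import pandas as pd

gdf = pd.DataFrame(
    {
        "n_properties_owned": [10, 2, 8],
        "city_owner_agency": [None, "", "PHA"],
    }
)
gdf["negligent_dev"] = (gdf["n_properties_owned"] > 5) & (
    gdf["city_owner_agency"].isna() | (gdf["city_owner_agency"] == "")
)
print(gdf)  # only the first row is flagged: many properties and no city owner agency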
