Skip to content

Commit f93476b

Browse files
authored
Merge pull request #1252 from CodeForPhilly/staging
Weekly PR from Staging to Main
2 parents 26f24c4 + 5a23066 commit f93476b

31 files changed

+1607
-503
lines changed

.pre-commit-config.yaml

Lines changed: 24 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,61 +1,54 @@
11
repos:
22
# Python hooks
3-
- repo: local
3+
- repo: https://github.com/astral-sh/ruff-pre-commit
4+
# Ruff version.
5+
rev: v0.12.0
46
hooks:
5-
# Ruff for Python linting
7+
# Run the linter.
68
- id: ruff
7-
name: ruff (Python linting)
8-
entry: ruff check --fix
9-
language: python
10-
types_or: [python, pyi, jupyter]
11-
files: \.py$
12-
13-
# Ruff for Python formatting
9+
args: [--fix]
10+
# Run the formatter.
1411
- id: ruff-format
15-
name: ruff-format (Python formatting)
16-
entry: ruff format
17-
language: python
18-
types_or: [python, pyi, jupyter]
19-
files: \.py$
20-
12+
- repo: local
13+
hooks:
2114
# Radon MI and Halstead checks
2215
- id: radon-check
2316
name: Radon Maintainability and Halstead Checks
2417
entry: |
25-
bash -c 'radon mi -s {} | grep -E " - [CDEF] \(" && echo "Radon MI grade below B detected!" && exit 1 || echo "All MI grades are B or above."
26-
radon hal {} -s | awk "/effort/ { if (\$3 > 2000) print FILENAME \": High Halstead effort:\" \$3 }"'
18+
bash -c 'for file in "$@"; do
19+
echo "Checking $file";
20+
radon mi -s "$file" | grep -E " - [CDEF] \(" && echo "Radon MI grade below B detected in $file!" && exit 1;
21+
radon hal "$file" -s | awk -v filename="$file" "/effort/ { if (\$3 > 2000) print filename \": High Halstead effort:\" \$3 }";
22+
done;
23+
echo "All MI grades are B or above."'
2724
language: system
2825
files: \.py$
29-
26+
pass_filenames: true
3027
- repo: https://github.com/jendrikseipp/vulture
31-
rev: 'v2.3' # or any later Vulture version
28+
rev: 'v2.3'
3229
hooks:
3330
- id: vulture
3431
name: vulture (Dead code detection)
35-
entry: vulture data/
36-
language: python
37-
types: [python]
38-
files: ^data/
39-
32+
args: [--min-confidence, '80', data/src]
4033
# JavaScript hooks
4134
- repo: local
4235
hooks:
43-
# Formatting with Prettier
36+
# Formatting with Prettier - run directly on files
4437
- id: prettier
4538
name: Prettier (Code formatting)
46-
entry: npm run format
39+
entry: npx prettier --write
4740
language: node
48-
files: \.(js|jsx|ts|tsx)$
49-
50-
# Linting with ESLint
41+
files: \.(js|jsx|ts|tsx|json|css|scss|md|yml|yaml)$
42+
# Linting with ESLint - run directly on files
5143
- id: eslint
5244
name: ESLint (JavaScript linting)
53-
entry: npm run lint
45+
entry: npx eslint --fix
5446
language: node
5547
files: \.(js|jsx|ts|tsx)$
56-
5748
# Dependency checks for Node.js
5849
- id: npm-audit
5950
name: Check Node.js dependencies
6051
entry: npm audit
6152
language: node
53+
files: ^package(-lock)?\.json$
54+
pass_filenames: false

data/src/classes/data_diff.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,8 @@ def generate_diff(self):
3535
print(
3636
f"Table {self.table_name} has less than two separate files with different timestamps. Cannot perform comparison"
3737
)
38+
self.summary_text = f"Table {self.table_name} has less than two separate files with different timestamps. Cannot perform comparison"
39+
return self
3840

3941
def extract_date(str) -> datetime:
4042
pattern = "\b\d{4}_\d{1,2}_\d{1,2}\b"

data/src/classes/file_manager.py

Lines changed: 76 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
1+
import glob
12
import os
3+
import time
24
import zipfile
35
from datetime import datetime
46
from enum import Enum
@@ -118,12 +120,30 @@ def check_source_cache_file_exists(
118120
table_name (str): The name of the table of source data.
119121
load_type (LoadType): The destination type of the file (either SOURCE_CACHE or PIPELINE_CACHE).
120122
"""
123+
start_time = time.time()
124+
print(
125+
f" FileManager.check_source_cache_file_exists: Checking for {table_name}"
126+
)
127+
121128
directory = (
122129
self.source_cache_directory
123130
if load_type == LoadType.SOURCE_CACHE
124131
else self.pipeline_cache_directory
125132
)
126-
return len([file for file in os.listdir(directory) if table_name in file]) > 0
133+
# Use glob pattern matching for more efficient file searching
134+
pattern = os.path.join(directory, f"*{table_name}*.parquet")
135+
136+
glob_start = time.time()
137+
files = glob.glob(pattern)
138+
glob_time = time.time() - glob_start
139+
140+
result = len(files) > 0
141+
total_time = time.time() - start_time
142+
143+
print(
144+
f" FileManager.check_source_cache_file_exists: Found {len(files)} files in {glob_time:.2f}s (total: {total_time:.2f}s)"
145+
)
146+
return result
127147

128148
def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
129149
"""
@@ -134,25 +154,45 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
134154
GeoDataFrame: The dataframe loaded from the most recent cached file.
135155
None: If no files exist for the given table name.
136156
"""
137-
cached_files = [
138-
file
139-
for file in os.listdir(self.source_cache_directory)
140-
if table_name in file
141-
]
157+
start_time = time.time()
158+
print(
159+
f" FileManager.get_most_recent_cache: Loading most recent cache for {table_name}"
160+
)
161+
162+
# Use glob pattern matching for more efficient file searching
163+
pattern = os.path.join(self.source_cache_directory, f"*{table_name}*.parquet")
164+
165+
glob_start = time.time()
166+
cached_files = glob.glob(pattern)
167+
glob_time = time.time() - glob_start
142168

143169
if not cached_files:
170+
print(" FileManager.get_most_recent_cache: No cached files found")
144171
return None
145172

146-
cached_files.sort(
147-
key=lambda x: os.path.getmtime(
148-
os.path.join(self.source_cache_directory, x)
149-
),
150-
reverse=True,
173+
# Get the most recent file by modification time
174+
mtime_start = time.time()
175+
most_recent_file = max(cached_files, key=os.path.getmtime)
176+
mtime_time = time.time() - mtime_start
177+
178+
print(
179+
f" FileManager.get_most_recent_cache: Found {len(cached_files)} files, most recent: {os.path.basename(most_recent_file)}"
180+
)
181+
print(
182+
f" FileManager.get_most_recent_cache: Glob took {glob_time:.2f}s, mtime check took {mtime_time:.2f}s"
183+
)
184+
185+
# Load the parquet file
186+
load_start = time.time()
187+
gdf = gpd.read_parquet(most_recent_file)
188+
load_time = time.time() - load_start
189+
190+
total_time = time.time() - start_time
191+
print(
192+
f" FileManager.get_most_recent_cache: Parquet load took {load_time:.2f}s (total: {total_time:.2f}s)"
151193
)
152-
most_recent_file = cached_files[0]
153-
file_path = self.get_file_path(most_recent_file, LoadType.SOURCE_CACHE)
154194

155-
return gpd.read_parquet(file_path)
195+
return gdf
156196

157197
def load_gdf(
158198
self, file_name: str, load_type: LoadType, file_type: FileType | None = None
@@ -194,16 +234,38 @@ def save_gdf(
194234
file_type (FileType): The type of the file (GEOJSON or PARQUET).
195235
load_type (LoadType): The destination type of the file (TEMP or CACHE).
196236
"""
237+
start_time = time.time()
238+
print(f" FileManager.save_gdf: Starting save for {file_name}")
239+
197240
file_path = self.get_file_path(file_name, load_type, file_type)
241+
print(f" FileManager.save_gdf: Target path: {file_path}")
242+
198243
if file_type == FileType.PARQUET:
244+
print(
245+
f" FileManager.save_gdf: Writing parquet file ({len(gdf)} rows, {len(gdf.columns)} columns)"
246+
)
247+
parquet_start = time.time()
199248
gdf.to_parquet(file_path, index=False)
249+
parquet_time = time.time() - parquet_start
250+
print(f" FileManager.save_gdf: Parquet write took {parquet_time:.2f}s")
200251
elif file_type == FileType.GEOJSON:
252+
print(" FileManager.save_gdf: Writing GeoJSON file")
253+
geojson_start = time.time()
201254
gdf.to_file(file_path, driver="GeoJSON")
255+
geojson_time = time.time() - geojson_start
256+
print(f" FileManager.save_gdf: GeoJSON write took {geojson_time:.2f}s")
202257
elif file_type == FileType.CSV:
258+
print(" FileManager.save_gdf: Writing CSV file")
259+
csv_start = time.time()
203260
gdf.to_csv(file_path)
261+
csv_time = time.time() - csv_start
262+
print(f" FileManager.save_gdf: CSV write took {csv_time:.2f}s")
204263
else:
205264
raise ValueError(f"Unsupported file type: {file_type}")
206265

266+
total_time = time.time() - start_time
267+
print(f" FileManager.save_gdf: Total save operation took {total_time:.2f}s")
268+
207269
def save_fractional_gdf(
208270
self,
209271
gdf: gpd.GeoDataFrame,

0 commit comments

Comments
 (0)