Skip to content

Commit 65e7c30

Browse files
committed
track runtimes
Signed-off-by: John Seekins <[email protected]>
1 parent 82b7057 commit 65e7c30

File tree

3 files changed: 12 additions and 9 deletions

enricher.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import time
55
from urllib.parse import quote
66
import urllib3
7-
from utils import logger
7+
from utils import logger, facilities_schema
88

99
# ExternalDataEnricher class for enrichment logic
1010

@@ -27,14 +27,14 @@ def __init__(self):
2727
self.session.headers.update({"User-Agent": "ICE-Facilities-Research/1.0 (Educational Research Purpose)"})
2828

2929
def enrich_facility_data(self, facilities_data: dict) -> dict:
30+
start_time = time.time()
3031
logger.info("Starting data enrichment with external sources...")
31-
enriched_data = copy.deepcopy(facilities_data)
32-
enriched_data["facilities"] = []
32+
enriched_data = copy.deepcopy(facilities_schema)
3333
total = len(facilities_data["facilities"])
3434

3535
for i, facility in enumerate(facilities_data["facilities"]):
3636
logger.info("Processing facility %s/%s: %s...", i + 1, total, facility["name"])
37-
enriched_facility = facility.copy()
37+
enriched_facility = copy.deepcopy(facility)
3838
base_enrichment = {
3939
"wikipedia_page_url": "",
4040
"wikipedia_search_query": "",
@@ -81,13 +81,11 @@ def enrich_facility_data(self, facilities_data: dict) -> dict:
8181
enriched_facility["osm_result_url"] = ""
8282
enriched_facility["osm_search_query"] = str(e)
8383

84-
enriched_data["facilities"].append(enriched_facility)
85-
86-
# do we need the "progress bar" if we show the count in the beginning message?
87-
# if (i + 1) % 10 == 0:
88-
# logger.info(" Progress: %s/%s facilities processed", i + 1, total)
84+
enriched_data["facilities"].append(enriched_facility) # type: ignore [attr-defined]
8985

9086
logger.info("Data enrichment completed!")
87+
enriched_data["enrich_runtime"] = time.time() - start_time
88+
logger.info(" Completed in %s seconds", enriched_data["enrich_runtime"])
9189
return enriched_data
9290

9391
def _search_wikipedia(self, facility_name: str) -> dict:

scraper.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(self):
3333

3434
def scrape_facilities(self):
3535
"""Scrape all ICE detention facility data from all 6 pages"""
36+
start_time = time.time()
3637
logger.info("Starting to scrape ICE detention facilities...")
3738

3839
self.facilities_data["scraped_date"] = datetime.datetime.utcnow()
@@ -51,7 +52,9 @@ def scrape_facilities(self):
5152
time.sleep(1) # Be respectful to the server
5253

5354
# self.facilities_data = all_facilities
55+
self.facilities_data["scrape_runtime"] = time.time() - start_time
5456
logger.info("Total facilities scraped: %s", len(self.facilities_data["facilities"]))
57+
logger.info(" Completed in %s seconds", self.facilities_data["scrape_runtime"])
5558
return self.facilities_data
5659

5760
def _scrape_updated(self, url: str):

utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
facilities_schema = {
1010
"scraped_date": datetime.datetime.utcnow(),
1111
"page_updated_date": datetime.datetime.utcnow(),
12+
"scrape_runtime": 0,
13+
"enrich_runtime": 0,
1214
"facilities": [],
1315
}
1416

0 commit comments

Comments
 (0)