
Commit 9c790fd

fix conflicts

Signed-off-by: John Seekins <[email protected]>
2 parents 06162c3 + 581b8b1

File tree

14 files changed, +579 -501 lines changed


README.md

Lines changed: 5 additions & 5 deletions

````diff
@@ -14,8 +14,9 @@ which will help with documenting the facilities appropriately. As these entries
 your CSV results change almost immediately.
 
 You can also use `--load-existing` to leverage an existing
-scrape of the data from ICE.gov. This is stored in default_data.py and includes the official current addresses of facilities.
-(Note ICE has been renaming known "detention center" sites to "processing center", and so on.)
+scrape of the data from ICE.gov. This is stored in `default_data.py` and includes the official current addresses of facilities.
+
+> Note ICE has been renaming known "detention center" sites to "processing center", and so on.
 
 The initial scrape data also keeps a `base64` encoded string containing the original HTML that was scraped from ice.gov about the
 facility. Keeping this initial data allows us to verify the resulting extracted data if we need to.
@@ -53,7 +54,7 @@ directory.
 uv run python main.py --load-existing --enrich --debug
 
 # With custom output file
-uv run python main.py --load-existing --enrich --debug-wikipedia -o debug_facilities.csv
+uv run python main.py --load-existing --enrich --debug-wikipedia -o debug_facilities
 ```
 
 ## Requirements
@@ -110,9 +111,8 @@ in hopes of finding similarly named pages but this is too aggressive, and it vee
 that have simpler names, like the county name instead of `county + detention center`). Use the debug mode to see what
 it is doing.
 * ICE scraping is not robustly tested. The image URL extraction needs some work. (should be able to get the detention center image URLs.)
-* OSM enrichment submits to OSM Nominatim API search with an extra comma between address number and street name.
 * The user-agent for running ice.gov scrape web requests calls itself `'User-Agent': 'ICE-Facilities-Research/1.0 (Educational Research Purpose)'`.
-  You can change this in scraper.py and enricher.py.
+  You can change this in `utils.py`.
 * It tells some pretty inaccurate percentages in the final summary - a lot of false positives, the Wikipedia debug percent
   seems wrong.
 * The remote query rate limiting is (I think) done in series but would go faster with parallel/async processing.
````
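
The user-agent bullet now points at `utils.py`, which is not included in this diff. Going only by the `default_headers` and `session` names that `enrichers/__init__.py` imports from it, the user-agent presumably lives in something like this hypothetical sketch:

```python
# Hypothetical sketch of the relevant piece of utils.py -- the real file is not
# part of this diff; only its `default_headers` and `session` names are imported
# by enrichers/__init__.py below.
import requests

default_headers = {
    # Edit this string to change the user-agent used for scrape/enrichment requests.
    "User-Agent": "ICE-Facilities-Research/1.0 (Educational Research Purpose)",
}

session = requests.Session()
session.headers.update(default_headers)
```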

enricher.py

Lines changed: 0 additions & 479 deletions
This file was deleted.

enrichers/README.md

Lines changed: 30 additions & 0 deletions

```markdown
# Facility enrichment scrapers

These functions let us collect data about facilities from additional sources.

## Enrichment class

The base class we can build enrichment tools from. It largely ensures some consistency in functionality between enrichment tools.

### Available functions

Sub-classing `Enrichment` provides the following functions/objects:

* `self.resp_info`
  * Pre-created response object following our expected schema
* `self._wait_time`
  * Simple rate limiting through `time.sleep()` calls; `_wait_time` tells us how long to sleep between calls to an individual API/site.
  * Defaults to `1` (seconds)
* `self._req(...)`
  * Wrapper function around a call to `requests.get` (using a properly configured `session` object)
  * Handles redirects
  * Supports most normal `requests` keyword arguments (`params`, `timeout`, `stream`, custom headers)
  * Raises for non-2xx/3xx status
  * Returns the entire `requests.Response` object for manipulation
* `_minimal_clean_facility_name(str)`
  * Standardizes the facility name for searching
* `_clean_facility_name(str)`
  * Standardizes the facility name for searching
  * More aggressive formatting than `_minimal_...` above

> All child classes should implement the `search()` function, which should return a dictionary using the `enrich_resp_schema` schema.
```
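
To make that contract concrete, here is a minimal hypothetical subclass. It assumes the helpers listed above plus `url`, `title`, and `search_query_steps` keys in `enrich_resp_schema` (the keys the real enrichers in this commit use); the endpoint and its response shape are invented for illustration.

```python
from enrichers import Enrichment
from utils import logger


class ExampleSource(Enrichment):
    """Hypothetical enricher sketch; example.org and its response shape are made up."""

    _wait_time: float = 2  # sleep 2 seconds between calls to this (imaginary) API

    def search(self) -> dict:
        name = self._clean_facility_name(self.search_args["facility_name"])
        self.resp_info["enrichment_type"] = "example"
        self.resp_info["search_query_steps"].append(name)
        try:
            # _req() merges in default_headers, follows redirects, raises on bad
            # status codes, and sleeps _wait_time seconds before returning
            response = self._req("https://example.org/api/search", params={"q": name})
        except Exception as e:
            logger.debug("Example search failed for '%s': %s", name, e)
            return self.resp_info
        results = response.json()
        if results:
            self.resp_info["title"] = results[0].get("title", "")
            self.resp_info["url"] = results[0].get("url", "")
        return self.resp_info
```

Construction requires the `facility_name` keyword (the `_required_keys` check in `Enrichment.__init__`, shown below, raises `KeyError` otherwise), so calling it is simply `ExampleSource(facility_name="Some County Jail").search()`.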

enrichers/__init__.py

Lines changed: 124 additions & 0 deletions

```python
"""
Import order here is a touch weird, but we need it so
types exist before attempting to import functions that
may call them
"""

import copy
import requests
from schemas import enrich_resp_schema
import time
from utils import (
    default_headers,
    session,
)


class Enrichment(object):
    _required_keys = [
        "facility_name",
    ]
    # in seconds
    _wait_time: float = 1

    def __init__(self, **kwargs):
        self.resp_info = copy.deepcopy(enrich_resp_schema)
        for k in self._required_keys:
            if k not in kwargs.keys():
                raise KeyError("Missing required key %s in %s", k, kwargs)
        self.search_args = copy.deepcopy(kwargs)

    def search(self) -> dict:
        """Child objects should implement this"""
        return {}

    def _req(self, url: str, **kwargs) -> requests.Response:
        """requests response wrapper to ensure we honor waits"""
        headers = kwargs.get("headers", {})
        # ensure we get all headers configured correctly
        # but manually applied headers win the argument
        for k, v in default_headers.items():
            if k in headers.keys():
                continue
            headers[k] = v

        response = session.get(
            url,
            allow_redirects=True,
            timeout=kwargs.get("timeout", 10),
            params=kwargs.get("params", {}),
            stream=kwargs.get("stream", False),
            headers=headers,
        )
        response.raise_for_status()
        time.sleep(self._wait_time)
        return response

    def _minimal_clean_facility_name(self, name: str) -> str:
        """Minimal cleaning that preserves important context like 'County Jail'"""
        cleaned = name

        # Remove pipe separators and take the main name
        if "|" in cleaned:
            parts = cleaned.split("|")
            cleaned = max(parts, key=len).strip()

        # Only remove very generic suffixes, keep specific ones like "County Jail"
        generic_suffixes = [
            "Service Processing Center",
            "ICE Processing Center",
            "Immigration Processing Center",
            "Contract Detention Facility",
            "Adult Detention Facility",
        ]

        for suffix in generic_suffixes:
            if cleaned.endswith(suffix):
                cleaned = cleaned[: -len(suffix)].strip()
                break

        return cleaned

    def _clean_facility_name(self, name: str) -> str:
        """Clean facility name for better search results"""
        # Remove common suffixes and prefixes that might interfere with search
        # This function may not be helpful - may be counterproductive.
        cleaned = name

        # Remove pipe separators and take the main name
        if "|" in cleaned:
            parts = cleaned.split("|")
            # Take the longer part as it's likely the full name
            cleaned = max(parts, key=len).strip()

        # Remove common facility type suffixes for broader search
        suffixes_to_remove = [
            "Detention Center",
            "Processing Center",
            "Correctional Center",
            "Correctional Facility",
            "Detention Facility",
            "Service Processing Center",
            "ICE Processing Center",
            "Immigration Processing Center",
            "Adult Detention Facility",
            "Contract Detention Facility",
            "Regional Detention Center",
            "County Jail",
            "County Detention Center",
            "Sheriff's Office",
            "Justice Center",
            "Safety Center",
            "Jail Services",
            "Correctional Complex",
            "Public Safety Complex",
        ]

        for suffix in suffixes_to_remove:
            if cleaned.endswith(suffix):
                cleaned = cleaned[: -len(suffix)].strip()
                break
        return cleaned


from .general import enrich_facility_data  # noqa: F401,E402
```

enrichers/general.py

Lines changed: 56 additions & 0 deletions

```python
from concurrent.futures import ProcessPoolExecutor
import copy
from enrichers import (
    openstreetmap,
    wikidata,
    wikipedia,
)
from schemas import (
    facilities_schema,
)
import time
from utils import logger


def enrich_facility_data(facilities_data: dict, workers: int = 3) -> dict:
    """wrapper function for multiprocessing of facility enrichment"""
    start_time = time.time()
    logger.info("Starting data enrichment with external sources...")
    enriched_data = copy.deepcopy(facilities_schema)
    total = len(facilities_data["facilities"])
    processed = 0

    with ProcessPoolExecutor(max_workers=workers) as pool:
        for res in pool.map(_enrich_facility, facilities_data["facilities"].items()):
            enriched_data["facilities"][res[0]] = res[1]  # type: ignore [index]
            processed += 1
            logger.info(" -> Finished %s, %s/%s completed", res[1]["name"], processed, total)

    logger.info("Data enrichment completed!")
    enriched_data["enrich_runtime"] = time.time() - start_time
    logger.info(" Completed in %s seconds", enriched_data["enrich_runtime"])
    return enriched_data


def _enrich_facility(facility_data: tuple) -> tuple:
    """enrich a single facility"""
    facility_id, facility = facility_data
    facility_name = facility["name"]
    logger.info("Enriching facility %s...", facility_name)
    enriched_facility = copy.deepcopy(facility)

    wiki_res = wikipedia.Wikipedia(facility_name=facility_name).search()
    wd_res = wikidata.Wikidata(facility_name=facility_name).search()
    osm = openstreetmap.OpenStreetMap(facility_name=facility_name, address=facility.get("address", {}))
    osm_res = osm.search()
    enriched_facility["wikipedia"]["page_url"] = wiki_res.get("url", "")
    enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "")
    enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "")
    enriched_facility["wikidata"]["search_query"] = wd_res.get("search_query_steps", "")
    enriched_facility["osm"]["latitude"] = osm_res.get("details", {}).get("latitude", osm.default_coords["latitude"])
    enriched_facility["osm"]["longitude"] = osm_res.get("details", {}).get("longitude", osm.default_coords["longitude"])
    enriched_facility["osm"]["url"] = osm_res.get("url", "")
    enriched_facility["osm"]["search_query"] = osm_res.get("search_query_steps", "")

    logger.debug(enriched_facility)
    return facility_id, enriched_facility
```
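
`enrich_facility_data` expects the record layout defined by `schemas.facilities_schema`, which is not part of this diff; the sketch below is a hypothetical input inferred from the keys that `_enrich_facility` and the OSM enricher read and write, just to show the calling convention.

```python
# Hypothetical input; real records come from the ICE scrape / default_data.py and
# follow schemas.facilities_schema, which this diff does not include.
facilities_data = {
    "facilities": {
        "example-id": {
            "name": "Example County Detention Center",  # required by every enricher
            "address": {                                 # optional; only the OSM enricher uses it
                "street": "100 Main St",
                "locality": "Example City",
                "administrative_area": "TX",
                "postal_code": "75001",
            },
            "wikipedia": {},
            "wikidata": {},
            "osm": {},
        },
    },
}

enriched = enrich_facility_data(facilities_data, workers=3)
print(enriched["facilities"]["example-id"]["osm"])  # url, latitude, longitude, search_query
print(enriched["enrich_runtime"])                   # wall-clock seconds for the whole run
```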

enrichers/openstreetmap.py

Lines changed: 106 additions & 0 deletions

```python
from enrichers import Enrichment
from utils import logger


class OpenStreetMap(Enrichment):
    # default to Washington, D.C.?
    default_coords: dict = {
        "latitude": 38.89511000,
        "longitude": -77.03637000,
    }

    def search(self) -> dict:
        facility_name = self.search_args["facility_name"]
        address = self.search_args.get("address", {})
        search_name = self._clean_facility_name(facility_name)
        search_url = "https://nominatim.openstreetmap.org/search"
        self.resp_info["enrichment_type"] = "openstreetmap"
        data = []
        if not address:
            logger.debug("No address for %s, simply searching for name", facility_name)
            params = {
                "q": search_name,
                "format": "json",
                "limit": 5,
                "dedupe": 1,
            }
            logger.debug("Searching OSM for %s", search_name)
            self.resp_info["search_query_steps"].append(search_name)  # type: ignore [attr-defined]
            try:
                response = self._req(search_url, params=params, timeout=15)
                logger.debug("Response: %s", response.text)
                data = response.json()
            except Exception as e:
                logger.debug(" OSM search error for '%s': %s", facility_name, e)
                self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
                return self.resp_info
        else:
            full_address = (
                f"{address['street']} {address['locality']}, {address['administrative_area']} {address['postal_code']}"
            )
            locality = f"{address['locality']}, {address['administrative_area']} {address['postal_code']}"
            search_url = "https://nominatim.openstreetmap.org/search"
            search_params = {
                "facility_name": {
                    "q": f"{search_name} {full_address}",
                    "format": "json",
                    "limit": 5,
                    "dedupe": 1,
                },
                "street_address": {
                    "q": f"{full_address}",
                    "format": "json",
                    "limit": 5,
                    "dedupe": 1,
                },
                "locality": {
                    "q": f"{locality}",
                    "format": "json",
                    "limit": 5,
                    "dedupe": 1,
                },
            }
            for search_name, params in search_params.items():
                logger.debug("Searching OSM for %s", params["q"])
                self.resp_info["search_query_steps"].append(params["q"])  # type: ignore [attr-defined]
                try:
                    response = self._req(search_url, params=params, timeout=15)
                    data = response.json()
                except Exception as e:
                    logger.debug(" OSM search error for '%s': %s", facility_name, e)
                    self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
                    continue
        if not data:
            return self.resp_info
        # when the URL result is a "way" this is usually correct.
        # checks top five results.
        match_terms = ["prison", "detention", "correctional", "jail"]
        for result in data:
            osm_type = result.get("type", "").lower()
            lat = result.get("lat", self.default_coords["latitude"])
            lon = result.get("lon", self.default_coords["longitude"])
            display_name = result.get("display_name", "").lower()
            if any(term in osm_type for term in match_terms) or any(term in display_name for term in match_terms):
                # todo courthouse could be added, or other tags such as "prison:for=migrant" as a clear positive search result.
                osm_id = result.get("osm_id", "")
                osm_type_prefix = result.get("osm_type", "")
                title = result.get("display_name", "")
                if osm_id and osm_type_prefix:
                    self.resp_info["url"] = f"https://www.openstreetmap.org/{osm_type_prefix}/{osm_id}"
                    self.resp_info["details"]["latitude"] = lat  # type: ignore [index]
                    self.resp_info["details"]["longitude"] = lon  # type: ignore [index]
                    self.resp_info["title"] = title
                    return self.resp_info
        # fallback to first result
        first_result = data[0]
        logger.debug("Address searches didn't directly find anything, just using the first result: %s", first_result)
        title = first_result.get("display_name", "")
        lat = first_result.get("lat", self.default_coords["latitude"])
        lon = first_result.get("lon", self.default_coords["longitude"])
        self.resp_info["search_query_steps"].append(f"{lat}&{lon}")  # type: ignore [attr-defined]
        if lat and lon:
            self.resp_info["url"] = f"https://www.openstreetmap.org/?mlat={lat}&mlon={lon}&zoom=15"
            self.resp_info["details"]["latitude"] = lat  # type: ignore [index]
            self.resp_info["details"]["longitude"] = lon  # type: ignore [index]
            self.resp_info["title"] = title
        return self.resp_info
```
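
For context, each element of the Nominatim `/search` JSON response looks roughly like the dictionary below (trimmed to the fields this class reads; values are illustrative). The `match_terms` check accepts a result when any of the terms appears in its `type` or `display_name`; only then is the `osm_type`/`osm_id` pair turned into a canonical openstreetmap.org URL, otherwise the first result's coordinates become a map-pin fallback URL.

```python
# Trimmed, illustrative Nominatim result -- values are made up, but the field
# names are the ones OpenStreetMap.search() reads.
result = {
    "osm_type": "way",
    "osm_id": 123456789,
    "lat": "31.5000000",
    "lon": "-97.2000000",
    "type": "prison",
    "display_name": "Example Detention Center, Example City, Texas, United States",
}

match_terms = ["prison", "detention", "correctional", "jail"]
is_match = any(t in result.get("type", "").lower() for t in match_terms) or any(
    t in result.get("display_name", "").lower() for t in match_terms
)
# is_match -> True, so the enrichment URL becomes
# f"https://www.openstreetmap.org/{result['osm_type']}/{result['osm_id']}"
```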

0 commit comments
