# ICEFieldOfficeScraper class and scraping-related code
import base64
from bs4 import BeautifulSoup, Tag
import copy
import datetime
import re
from schemas import (
    field_offices_schema,
    field_office_schema,
)
import time
from utils import (
    logger,
    session,
)


class ICEFieldOfficeScraper(object):
    base_scrape_url = "https://www.ice.gov/contact/field-offices"

    def __init__(self):
        self.office_data = copy.deepcopy(field_offices_schema)

    def _get_scrape_pages(self) -> list:
        """Discover all paginated field office listing pages"""
        resp = session.get(self.base_scrape_url, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")
        links = soup.find_all("a", href=re.compile(r"\?page="))
        if not links:
            raise Exception(f"{self.base_scrape_url} contains *no* pagination links?!")
        pages = [
            f"{self.base_scrape_url}{link['href']}&exposed_form_display=1"
            for link in links
            if not any(k in link.get("aria-label", "") for k in ["Next", "Last"])
        ]
        logger.debug("Pages discovered: %s", pages)
        return pages

    def scrape_field_offices(self) -> dict:
        """Collect data on ICE field offices"""
        start_time = time.time()
        self.office_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
        logger.info("Starting to scrape ICE.gov field offices...")
        urls = self._get_scrape_pages()
        for page_num, url in enumerate(urls):
            logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
            offices = self._scrape_page(url)
            logger.debug("Found %s offices on page %s", len(offices), page_num + 1)
            time.sleep(1)  # Be respectful to the server
            for office in offices:
                self.office_data["field_offices"][office["field_office"]] = office
        self.office_data["scrape_runtime"] = time.time() - start_time
        logger.info("Total field offices scraped: %s", len(self.office_data["field_offices"]))
        logger.info(" Completed in %s seconds", self.office_data["scrape_runtime"])
        return self.office_data

    def _scrape_page(self, page_url: str) -> list:
        """Scrape a single page of field offices using BeautifulSoup"""
        logger.debug(" Fetching: %s", page_url)
        try:
            response = session.get(page_url, timeout=30)
            response.raise_for_status()
        except Exception as e:
            logger.error(" Error fetching %s: %s", page_url, e)
            return []
        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")
        offices = []

        # Look for the main content area - ICE uses different possible containers
        content_selectors = [
            "div.view-content",  # Primary content container
            "div.views-rows",  # Alternative container
            "ul.views-rows",  # List-based container
            "div.region-content",  # Region content
            "main",  # HTML5 main element
            "div.content",  # Generic content
        ]
        content_container = None
        logger.debug("Searching %s for content", page_url)
        for selector in content_selectors:
            content_container = soup.select_one(selector)
            if content_container:
                logger.debug(" Found content using selector: %s", selector)
                break

        if not content_container:
            logger.warning(" Could not find content container, searching entire page")
            content_container = soup

        # Look for field office entries - try multiple patterns
        office_selectors = [
            "li.grid",  # List items with grid class
            "div.views-row",  # View rows
            "li.views-row",  # List-based view rows
            "div.facility-item",  # Custom facility items
            "article",  # Article elements
            "div.node",  # Drupal node containers
        ]
        office_elements: list = []
        for selector in office_selectors:
            elements = content_container.select(selector)
            if elements:
                office_elements = elements
                logger.debug(
                    " Found %s office elements using selector: %s",
                    len(elements),
                    selector,
                )
                break

        # if not office_elements:
        #     # Fallback: look for any element containing office-like text patterns
        #     logger.warning(" Using fallback: searching for office patterns in text")
        #     office_elements = self._find_office_patterns(content_container)

        # Extract data from each office element
        for element in office_elements:
            office_data = self._extract_single_office(element, page_url)
            if office_data and office_data.get("name", None):
                offices.append(office_data)
        logger.info(" Extracted %s field offices from page", len(offices))

        return offices

    def _extract_single_office(self, element: Tag, page_url: str) -> dict:
        """Extract data from a single office element"""
        office = copy.deepcopy(field_office_schema)
        # Preserve the element's raw HTML (base64-encoded) alongside the parsed fields
        raw_scrape = str(element)
        office["raw_scrape"] = base64.b64encode(raw_scrape.encode("utf-8")).decode("utf-8")
        office["source_urls"].append(page_url)
        logger.debug("Trying to get office data from %s", element)
        office_name = element.select_one(".views-field-field-field-office-location")
        if not office_name or not office_name.text.strip().endswith("ERO"):
            # Not a field office (or the location cell is missing entirely)
            logger.debug(
                " Skipping %s because it is not an ERO location",
                office_name.text.strip() if office_name else "<no location element>",
            )
            return {}
        office["name"] = office_name.text.strip()
        field_office = element.select_one(".views-field-title")
        if field_office:
            office["field_office"] = field_office.text.strip()
        address = element.select_one(".address-line1")
        if address:
            office["address"]["street"] = address.text.strip()
        # optional line 2 of address
        address = element.select_one(".address-line2")
        if address:
            office["address"]["street"] = f"{office['address']['street']} {address.text.strip()}"
        locality = element.select_one(".locality")
        if locality:
            office["address"]["locality"] = locality.text.strip()
        administrative_area = element.select_one(".administrative-area")
        if administrative_area:
            office["address"]["administrative_area"] = administrative_area.text.strip()
        postal_code = element.select_one(".postal-code")
        if postal_code:
            office["address"]["postal_code"] = postal_code.text.strip()
        office["address_str"] = (
            f"{office['address']['street']} {office['address']['locality']}, "
            f"{office['address']['administrative_area']} {office['address']['postal_code']}"
        )
        country = element.select_one(".country")
        if country:
            office["address"]["country"] = country.text.strip()
        phone = element.select_one(".ct-addr")
        if phone:
            office["phone"] = phone.text.strip()
        details = element.select_one(".views-field-body")
        if details:
            # The body field holds the email link and the "Area of Responsibility" text
            email = details.find_all("a")
            if email:
                office["email"] = email[0]["href"].split(":", 1)[1]
            detail_txt = details.text
            logger.debug("Detail text: %s", detail_txt)
            aor_match = re.search(r"Area of Responsibility:(.+)\n?Email", detail_txt)
            if aor_match:
                office["aor"] = aor_match.group(1).strip().replace("\xa0", " ")

        logger.debug("Returning %s", office)
        return office
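

# Example usage: a minimal sketch, not part of the scraper itself. It assumes this
# module is run directly and that `utils.logger`/`utils.session` are already
# configured by the surrounding project.
if __name__ == "__main__":
    import json

    scraper = ICEFieldOfficeScraper()
    data = scraper.scrape_field_offices()
    # scraped_date is a datetime, so let json fall back to str() for it
    print(json.dumps(data, indent=2, default=str))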