
Commit 99c385d

Merge pull request #41 from johnseekins/additional-stats

Some additional statistics in the data model
2 parents b1bf3a0 + 6d290af commit 99c385d

9 files changed: +796 −487 lines changed

.github/dependabot.yml

Lines changed: 9 additions & 0 deletions
@@ -5,3 +5,12 @@ updates:
     directory: "/"
     schedule:
       interval: "weekly"
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
+    open-pull-requests-limit: 1
+    groups:
+      actions:
+        dependency-type: "production"
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json
+name: Ensure Pre-commit services are updated (if possible)
+
+on:
+  workflow_dispatch: {}
+  schedule:
+    - cron: '0 0 1 * *' # Run on midnight on the first of the month
+
+permissions:
+  pull-requests: write
+  contents: write
+
+concurrency:
+  group: precommit-updates
+  cancel-in-progress: true
+
+jobs:
+  auto-update:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - id: file-check
+        run: |
+          files=$(find . -type f -name ".pre-commit-config.yaml" -print0 | xargs)
+          if [[ -s "${files}" ]]; then
+            echo "files_exist=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "files_exist=false" >> "$GITHUB_OUTPUT"
+          fi
+      # selfhosted nodes have precommit installed by default
+      - name: Run pre-commit autoupdate
+        if: steps.file-check.outputs.files_exist == 'true'
+        run: |
+          sudo apt-get install -yqq python3-pip python3-wheel
+          pip3 install -q --disable-pip-version-check pre-commit
+          pre-commit autoupdate
+      - name: Create Pull Request
+        if: steps.file-check.outputs.files_exist == 'true'
+        uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e
+        with:
+          token: ${{ github.token }}
+          branch: update/pre-commit-autoupdate
+          title: Auto-update pre-commit hooks
+          commit-message: ND - Auto-update pre-commit hooks
+          body: |
+            Update versions of tools in pre-commit
+            configs to latest version
+          labels: dependencies
+          delete-branch: true
+          sign-commits: true
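
Not part of the commit, but for orientation: a rough Python sketch of what the file-check step amounts to (look for any .pre-commit-config.yaml in the checkout and write a files_exist flag), assuming it runs in an Actions job where the runner sets GITHUB_OUTPUT:

import os
from pathlib import Path

# Look for pre-commit configs anywhere in the checkout (mirrors the find | xargs step)
configs = list(Path(".").rglob(".pre-commit-config.yaml"))

# GITHUB_OUTPUT is provided by the Actions runner; skip quietly when run locally
output_file = os.environ.get("GITHUB_OUTPUT")
if output_file:
    with open(output_file, "a") as fh:
        fh.write(f"files_exist={'true' if configs else 'false'}\n")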

default_data.py

Lines changed: 699 additions & 483 deletions
Large diffs are not rendered by default.

ice_scrapers/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -36,6 +36,13 @@
     "Last Final Rating",
 ]
 
+ice_inspection_types = {
+    # found in https://www.ice.gov/foia/odo-facility-inspections
+    "ODO": "Office of Detention Oversight",
+    # found in https://ia803100.us.archive.org/16/items/6213032-ORSA-MOU-ICE/6213032-ORSA-MOU-ICE_text.pdf
+    "ORSA": "Operational Review Self-Assessment",
+}
+
 # extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx 2025-09-07
 ice_facility_types = {
     "BOP": {

ice_scrapers/field_offices.py

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ def _extract_single_office(element: BeautifulSoup, page_url: str) -> dict:
         office["email"] = email[0]["href"].split(":", 1)[1]
     detail_txt = details.text  # type: ignore [union-attr]
     logger.debug("Detail text: %s", detail_txt)
-    aor_match = re.search(r"Area of Responsibility:(.+)Email", detail_txt)
+    aor_match = re.search(r"Area of Responsibility:(.+)\n?Email", detail_txt)
     if aor_match:
         office["aor"] = aor_match.group(1).strip().replace("\xa0", " ")
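
The `\n?` matters because `.` does not match newlines without re.DOTALL, so the old pattern failed whenever "Email" started a new line in the scraped detail text. A small sketch with made-up detail text and email address:

import re

detail_txt = "Area of Responsibility: Colorado, Utah, Wyoming\nEmail: field.office@example.gov"

# Old pattern: "." stops at the newline, so nothing matches
assert re.search(r"Area of Responsibility:(.+)Email", detail_txt) is None

# New pattern: the optional newline lets "Email" start the next line
aor_match = re.search(r"Area of Responsibility:(.+)\n?Email", detail_txt)
assert aor_match.group(1).strip() == "Colorado, Utah, Wyoming"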

ice_scrapers/spreadsheet_load.py

Lines changed: 17 additions & 0 deletions
@@ -12,6 +12,7 @@
     clean_street,
     facility_sheet_header,
     ice_facility_types,
+    ice_inspection_types,
     repair_zip,
     repair_locality,
 )
@@ -99,10 +100,15 @@ def load_sheet(keep_sheet: bool = True) -> dict:
         details["address"]["postal_code"] = zcode
         details["address"]["street"] = street
         details["name"] = row["Name"]
+
+        # population statistics
         details["population"]["male"]["criminal"] = row["Male Crim"]
         details["population"]["male"]["non_criminal"] = row["Male Non-Crim"]
         details["population"]["female"]["criminal"] = row["Female Crim"]
         details["population"]["female"]["non_criminal"] = row["Female Non-Crim"]
+        details["population"]["total"] = (
+            row["Male Crim"] + row["Male Non-Crim"] + row["Female Crim"] + row["Female Non-Crim"]
+        )
         if row["Male/Female"]:
             if "/" in row["Male/Female"]:
                 details["population"]["female"]["allowed"] = True
@@ -117,6 +123,15 @@ def load_sheet(keep_sheet: bool = True) -> dict:
             "level_3": row["ICE Threat Level 3"],
             "none": row["No ICE Threat Level"],
         }
+        """
+        # extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx 2025-09-22
+        Upon admission and periodically thereafter, detainees are categorized into a security level based on a variety of public safety factors, and are housed accordingly. Factors include prior convictions, threat risk, disciplinary record, special vulnerabilities, and special management concerns. Detainees are categorized into one of four classes of security risk: A/low, B/medium low, C/medium high, and D/high.
+        """
+        details["population"]["security_threat"]["low"] = row["Level A"]
+        details["population"]["security_threat"]["medium_low"] = row["Level B"]
+        details["population"]["security_threat"]["medium_high"] = row["Level C"]
+        details["population"]["security_threat"]["high"] = row["Level D"]
+
         details["facility_type"] = {
             "id": row["Type Detailed"],
             "housing": {
@@ -130,6 +145,8 @@ def load_sheet(keep_sheet: bool = True) -> dict:
         details["facility_type"]["expanded_name"] = ft_details["expanded_name"]
         details["avg_stay_length"] = row["FY25 ALOS"]
         details["inspection"] = {
+            # fall back to type code
+            "last_type": ice_inspection_types.get(row["Last Inspection Type"], row["Last Inspection Type"]),
            "last_date": row["Last Inspection End Date"],
            "last_rating": row["Last Final Rating"],
        }
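
A minimal sketch of the new population bookkeeping, using a hand-made stand-in for a spreadsheet row (column names from the hunk above, values invented):

row = {  # invented values; the real rows come from the FY25 detention stats workbook
    "Male Crim": 120, "Male Non-Crim": 310, "Female Crim": 15, "Female Non-Crim": 55,
    "Level A": 200, "Level B": 180, "Level C": 90, "Level D": 30,
}

population: dict = {"security_threat": {}}
population["total"] = (
    row["Male Crim"] + row["Male Non-Crim"] + row["Female Crim"] + row["Female Non-Crim"]
)  # 500
# Spreadsheet security levels A-D land in the low -> high buckets
for column, bucket in (("Level A", "low"), ("Level B", "medium_low"),
                       ("Level C", "medium_high"), ("Level D", "high")):
    population["security_threat"][bucket] = row[column]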

ice_scrapers/utils.py

Lines changed: 4 additions & 2 deletions
@@ -135,8 +135,10 @@ def repair_zip(zip_code: int, locality: str) -> Tuple[str, bool]:
     """
     zcode = str(zip_code)
     cleaned = False
-    if len(zcode) == 4:
-        zcode = f"0{zcode}"
+    if len(zcode) < 5:
+        # pad any prefix
+        zeros = "0" * (5 - len(zcode))
+        zcode = f"{zeros}{zcode}"
         return zcode, cleaned
     matches = [
         {"match": "89512", "replace": "89506", "locality": "Reno"},

schemas.py

Lines changed: 8 additions & 0 deletions
@@ -79,6 +79,13 @@
             "level_3": 0,
             "none": 0,
         },
+        "total": 0,
+        "security_threat": {
+            "low": 0,
+            "medium_low": 0,
+            "medium_high": 0,
+            "high": 0,
+        },
     },
     "facility_type": {
         "id": "",
@@ -90,6 +97,7 @@
         },
     },
     "inspection": {
+        "last_type": "",
         "last_date": None,
         "last_rating": "",
     },

utils.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
     """flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.)"""
     items: list = []
     for k, v in d.items():
-        new_key = parent_key + sep + str(k) if parent_key else str(k)
+        new_key = f"{parent_key}{sep}{str(k)}" if parent_key else str(k)
         if isinstance(v, dict):
             items.extend(_flatdict(v, new_key, sep=sep).items())
         else:
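
For context, this is what the flattened keys look like; the helper below is a simplified re-implementation of _flatdict for illustration rather than the module's own code:

def flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
    # simplified illustration of the flattening behaviour
    items: list = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{str(k)}" if parent_key else str(k)
        if isinstance(v, dict):
            items.extend(flatdict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

print(flatdict({"population": {"security_threat": {"low": 3}}}))
# {'population.security_threat.low': 3}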

0 commit comments
