Skip to content

Commit 0b019ff

Browse files
tifa365 and tim authored
[HE] Add geolocation support from OpenStreetMap iframes (#220)
* [HE] Add geolocation support from OpenStreetMap iframes Extract coordinates from OSM iframes and links on school detail pages using standard library parsing (no new dependencies). Currently achieves 90.7% coverage (1,863/2,054 schools). Co-authored-by: tim <[email protected]>
1 parent cff30cf commit 0b019ff

File tree

2 files changed

+53
-2
lines changed

2 files changed

+53
-2
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ When available, we try to use the geolocations provided by the data publishers.
4949
| BB | ✅ Yes | WFS |
5050
| HB | ❌ No | - |
5151
| HH | ✅ Yes | WFS |
52-
| HE | ❌ No | - |
52+
| HE | ⚠️ Partial (~90%) | Extracted from OSM on detail pages. The schools without coordinates are schools with placeholder coordinates that are filtered out and schools with no map data at all. |
5353
| MV | ✅ Yes | WFS |
5454
| NI | ❌ No | - |
5555
| NW | ✅ Yes | Converted from EPSG:25832 in source CSV data |

jedeschule/spiders/hessen.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import scrapy
22
import re
3+
from urllib.parse import urlparse, parse_qs
34

45
from scrapy import Item
56

@@ -8,15 +9,24 @@
89

910

1011
class HessenSpider(SchoolSpider):
12+
"""Spider for scraping school data from Hessen's school database
13+
14+
Extracts school information by:
15+
1. Submitting search forms for each school type
16+
2. Parsing result lists for school detail page links
17+
3. Extracting contact info and coordinates from detail pages
18+
"""
1119
name = "hessen"
1220

1321
start_urls = ["https://schul-db.bildung.hessen.de/schul_db.html"]
1422

1523
def parse(self, response):
24+
# Extract all available school types from the dropdown
1625
school_types = response.xpath(
1726
'//select[@id="id_school_type"]/option/@value'
1827
).extract()
1928

29+
# Build search form with empty filters to get all schools per type
2030
form = {
2131
"school_name": "",
2232
"school_town": "",
@@ -28,6 +38,7 @@ def parse(self, response):
2838
"submit_hesse": "Hessische+Schule+suchen+...",
2939
}
3040

41+
# Submit one search per school type to retrieve all schools
3142
for school_type in school_types:
3243
form["school_type"] = school_type
3344

@@ -36,28 +47,48 @@ def parse(self, response):
3647
)
3748

3849
def parse_list(self, response):
    """Follow every school detail link found in the search-result table.

    Each row of the result table carries an anchor pointing at a school's
    detail page; a request is scheduled per link with ``parse_details``
    as the callback.
    """
    detail_urls = response.xpath("//tbody/tr/td/a/@href").extract()
    for detail_url in detail_urls:
        yield scrapy.Request(detail_url, callback=self.parse_details)
4355

56+
def _extract_coords_from_osm_url(self, url: str) -> tuple[float, float] | tuple[None, None]:
57+
"""Extract coordinates from OpenStreetMap iframe URL marker parameter"""
58+
qs = parse_qs(urlparse(url).query)
59+
60+
# Extract marker parameter (format: "latitude,longitude")
61+
if "marker" in qs and qs["marker"]:
62+
try:
63+
lat_str, lon_str = qs["marker"][0].split(",", 1)
64+
return float(lat_str), float(lon_str)
65+
except (ValueError, IndexError):
66+
pass
67+
68+
return None, None
69+
4470
def parse_details(self, response):
71+
# Extract basic school info from <pre> text blocks
4572
contact_text_nodes = response.xpath("//pre/text()").extract()
4673
adress = contact_text_nodes[0].split("\n")
4774

75+
# Parse ZIP and city from line 4 (format: "12345 City Name")
4876
matches = re.search(r"(\d+) (.+)", adress[3])
4977

78+
# Build school dict with required fields
5079
school = {
5180
"name": adress[1],
5281
"straße": adress[2],
5382
"ort": matches.group(2),
5483
"plz": matches.group(1),
5584
}
5685

86+
# Extract optional fax number if present
5787
for text_node in contact_text_nodes:
5888
if "Fax: " in text_node:
5989
school["fax"] = text_node.split("\n")[1].replace("Fax: ", "").strip()
6090

91+
# Extract phone and website from links
6192
contact_links = response.xpath("//pre/a/@href").extract()
6293
for link in contact_links:
6394
if "tel:" in link:
@@ -66,18 +97,36 @@ def parse_details(self, response):
6697
if "http" in link:
6798
school["homepage"] = link
6899

100+
# Extract school type from main content area
69101
school["schultyp"] = (
70102
response.xpath('//main//div[@class="col-md-9 col-lg-9"]/text()')
71103
.extract_first()
72104
.replace("\n", "")
73105
.strip()
74106
)
107+
# Extract school ID from URL query parameter
75108
school["id"] = response.request.url.split("=")[-1]
76109

110+
# Extract coordinates from OpenStreetMap iframe
111+
latitude, longitude = None, None
112+
iframe_src = response.xpath('//iframe[contains(@src, "openstreetmap.org")]/@src').get()
113+
if iframe_src:
114+
latitude, longitude = self._extract_coords_from_osm_url(iframe_src)
115+
116+
# Filter out placeholder coordinates (-1.0, -1.0) used by Hessen DB for missing data
117+
# Example: https://schul-db.bildung.hessen.de/schul_db.html/details/?school_no=9642
118+
if latitude == -1.0 and longitude == -1.0:
119+
latitude = None
120+
longitude = None
121+
122+
school["latitude"] = latitude
123+
school["longitude"] = longitude
124+
77125
yield school
78126

79127
@staticmethod
80128
def normalize(item: Item) -> School:
129+
"""Transform raw scraped data into standardized School model"""
81130
return School(
82131
name=item.get("name"),
83132
phone=item.get("telefon"),
@@ -87,5 +136,7 @@ def normalize(item: Item) -> School:
87136
city=item.get("ort"),
88137
zip=item.get("plz"),
89138
school_type=item.get("schultyp"),
90-
id="HE-{}".format(item.get("id")),
139+
id="HE-{}".format(item.get("id")), # Prefix with state code
140+
latitude=item.get("latitude"),
141+
longitude=item.get("longitude"),
91142
)

0 commit comments

Comments
 (0)