Skip to content

Commit 0b019ff

Browse files
tifa365 and tim authored
[HE] Add geolocation support from OpenStreetMap iframes (#220)
* [HE] Add geolocation support from OpenStreetMap iframes Extract coordinates from OSM iframes and links on school detail pages using standard library parsing (no new dependencies). Currently achieves 90.7% coverage (1,863/2,054 schools). Co-authored-by: tim <[email protected]>
1 parent cff30cf commit 0b019ff

File tree

2 files changed

+53
-2
lines changed

2 files changed

+53
-2
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ When available, we try to use the geolocations provided by the data publishers.
4949
| BB | ✅ Yes | WFS |
5050
| HB | ❌ No | - |
5151
| HH | ✅ Yes | WFS |
52-
| HE | ❌ No | - |
52+
| HE | ⚠️ Partial (~90%) | Extracted from OSM on detail pages. The schools without coordinates are schools with placeholder coordinates that are filtered out and schools with no map data at all. |
5353
| MV | ✅ Yes | WFS |
5454
| NI | ❌ No | - |
5555
| NW | ✅ Yes | Converted from EPSG:25832 in source CSV data |

jedeschule/spiders/hessen.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import scrapy
22
import re
3+
from urllib.parse import urlparse, parse_qs
34

45
from scrapy import Item
56

@@ -8,15 +9,24 @@
89

910

1011
class HessenSpider(SchoolSpider):
12+
"""Spider for scraping school data from Hessen's school database
13+
14+
Extracts school information by:
15+
1. Submitting search forms for each school type
16+
2. Parsing result lists for school detail page links
17+
3. Extracting contact info and coordinates from detail pages
18+
"""
1119
name = "hessen"
1220

1321
start_urls = ["https://schul-db.bildung.hessen.de/schul_db.html"]
1422

1523
def parse(self, response):
24+
# Extract all available school types from the dropdown
1625
school_types = response.xpath(
1726
'//select[@id="id_school_type"]/option/@value'
1827
).extract()
1928

29+
# Build search form with empty filters to get all schools per type
2030
form = {
2131
"school_name": "",
2232
"school_town": "",
@@ -28,6 +38,7 @@ def parse(self, response):
2838
"submit_hesse": "Hessische+Schule+suchen+...",
2939
}
3040

41+
# Submit one search per school type to retrieve all schools
3142
for school_type in school_types:
3243
form["school_type"] = school_type
3344

@@ -36,28 +47,48 @@ def parse(self, response):
3647
)
3748

3849
def parse_list(self, response):
    """Follow every school detail link found in the search-result table.

    Each row of the result table carries an anchor pointing at a school's
    detail page; a request is scheduled per link with ``parse_details``
    as the callback.
    """
    detail_urls = response.xpath("//tbody/tr/td/a/@href").extract()
    for detail_url in detail_urls:
        yield scrapy.Request(detail_url, callback=self.parse_details)
4355

56+
def _extract_coords_from_osm_url(self, url: str) -> tuple[float, float] | tuple[None, None]:
57+
"""Extract coordinates from OpenStreetMap iframe URL marker parameter"""
58+
qs = parse_qs(urlparse(url).query)
59+
60+
# Extract marker parameter (format: "latitude,longitude")
61+
if "marker" in qs and qs["marker"]:
62+
try:
63+
lat_str, lon_str = qs["marker"][0].split(",", 1)
64+
return float(lat_str), float(lon_str)
65+
except (ValueError, IndexError):
66+
pass
67+
68+
return None, None
69+
4470
def parse_details(self, response):
71+
# Extract basic school info from <pre> text blocks
4572
contact_text_nodes = response.xpath("//pre/text()").extract()
4673
adress = contact_text_nodes[0].split("\n")
4774

75+
# Parse ZIP and city from line 4 (format: "12345 City Name")
4876
matches = re.search(r"(\d+) (.+)", adress[3])
4977

78+
# Build school dict with required fields
5079
school = {
5180
"name": adress[1],
5281
"straße": adress[2],
5382
"ort": matches.group(2),
5483
"plz": matches.group(1),
5584
}
5685

86+
# Extract optional fax number if present
5787
for text_node in contact_text_nodes:
5888
if "Fax: " in text_node:
5989
school["fax"] = text_node.split("\n")[1].replace("Fax: ", "").strip()
6090

91+
# Extract phone and website from links
6192
contact_links = response.xpath("//pre/a/@href").extract()
6293
for link in contact_links:
6394
if "tel:" in link:
@@ -66,18 +97,36 @@ def parse_details(self, response):
6697
if "http" in link:
6798
school["homepage"] = link
6899

100+
# Extract school type from main content area
69101
school["schultyp"] = (
70102
response.xpath('//main//div[@class="col-md-9 col-lg-9"]/text()')
71103
.extract_first()
72104
.replace("\n", "")
73105
.strip()
74106
)
107+
# Extract school ID from URL query parameter
75108
school["id"] = response.request.url.split("=")[-1]
76109

110+
# Extract coordinates from OpenStreetMap iframe
111+
latitude, longitude = None, None
112+
iframe_src = response.xpath('//iframe[contains(@src, "openstreetmap.org")]/@src').get()
113+
if iframe_src:
114+
latitude, longitude = self._extract_coords_from_osm_url(iframe_src)
115+
116+
# Filter out placeholder coordinates (-1.0, -1.0) used by Hessen DB for missing data
117+
# Example: https://schul-db.bildung.hessen.de/schul_db.html/details/?school_no=9642
118+
if latitude == -1.0 and longitude == -1.0:
119+
latitude = None
120+
longitude = None
121+
122+
school["latitude"] = latitude
123+
school["longitude"] = longitude
124+
77125
yield school
78126

79127
@staticmethod
80128
def normalize(item: Item) -> School:
129+
"""Transform raw scraped data into standardized School model"""
81130
return School(
82131
name=item.get("name"),
83132
phone=item.get("telefon"),
@@ -87,5 +136,7 @@ def normalize(item: Item) -> School:
87136
city=item.get("ort"),
88137
zip=item.get("plz"),
89138
school_type=item.get("schultyp"),
90-
id="HE-{}".format(item.get("id")),
139+
id="HE-{}".format(item.get("id")), # Prefix with state code
140+
latitude=item.get("latitude"),
141+
longitude=item.get("longitude"),
91142
)

0 commit comments

Comments
 (0)