Skip to content

Commit 0feff03

Browse files
tifa365 and tim authored
[ST]: Replace HTML scraper with ArcGIS API scraper (#219)
* Replace Sachsen-Anhalt HTML scraper with ArcGIS API scraper

  Switch from HTML scraping (bildung-lsa.de) to ArcGIS FeatureServer API. This provides cleaner data access and adds geolocation support.

  Changes:
  - Replace HTML parsing with ArcGIS REST API JSON parsing
  - Add coordinate transformation (EPSG:25832 -> WGS84) using pyproj
  - Add geolocation coverage: 100% (857 schools)
  - Update ID scheme: ST-1001186 -> ST-ARC00001 (OBJECTID-based)
  - Update README: Mark ST as having geolocation via ArcGIS
  - Note: OBJECTID stability is uncertain (may change on reimport)

  Data source: services-eu1.arcgis.com ArcGIS FeatureServer
  Coverage: 857 schools (excludes vocational schools)

* Remove personal dev files from .gitignore (now in global gitignore)

* Use None instead of empty strings for missing data

  Replace empty string defaults with None when extracting school attributes. This provides clearer semantics for missing data and follows database best practices (NULL vs empty string).

  Note: Current ArcGIS dataset has no missing values, but this change future-proofs the code and follows Python/SQL conventions.

* Remove exception handling from coordinate transformation

  Let coordinate transformation fail loudly if it encounters issues rather than silently logging a warning and continuing with null coordinates. This follows the 'fail fast' principle - if transformation fails, we want to know immediately so we can fix the root cause rather than silently producing incomplete data.

---------

Co-authored-by: tim <[email protected]>
1 parent 230ee34 commit 0feff03

File tree

2 files changed

+46
-45
lines changed

2 files changed

+46
-45
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ In details, the IDs are sourced as follows:
3535
|RP| `Schulnummer` from the school's details page | `RP-50720` |✅ likely|
3636
|SL| `OBJECTID` from the WFS service | `SL-255` |❌ no (confirmed with data provider but no alternative available) |
3737
|SN| Field `id` from the API | `SN-4062` |✅ likely|
38-
|ST| `ID` query param from the details page URL | `ST-1001186` |probably?|
38+
|ST| `OBJECTID` from the ArcGIS FeatureServer API (prefixed with `ARC`) | `ST-ARC00001` |unlikely (OBJECTID may change on data reimport)|
3939
|TH| `Schulnummer` from the WFS service | `TH-10601` |✅ likely|
4040

4141
## Geolocations
@@ -56,7 +56,7 @@ When available, we try to use the geolocations provided by the data publishers.
5656
| RP | ❌ No | - |
5757
| SL | ✅ Yes | WFS |
5858
| SN | ✅ Yes | API |
59-
| ST | ❌ No | - |
59+
| ST | ✅ Yes | ArcGIS (converted from EPSG:25832) |
6060
| TH | ✅ Yes | WFS |
6161

6262
## Additional Data Fields
Lines changed: 44 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,65 @@
11
# -*- coding: utf-8 -*-
2-
import re
32
import scrapy
43
from scrapy import Item
4+
from pyproj import Transformer
55
from jedeschule.items import School
66
from jedeschule.spiders.school_spider import SchoolSpider
77

88

99
class SachsenAnhaltSpider(SchoolSpider):
1010
name = "sachsen-anhalt"
11+
12+
# ArcGIS FeatureServer API - contains 857 schools with coordinates
13+
# Note: This dataset excludes vocational schools (Berufsbildende Schulen)
14+
# but includes all primary, secondary, grammar, and special needs schools
1115
start_urls = [
12-
"https://www.bildung-lsa.de/ajax.php?m=getSSResult&q=&lk=-1&sf=-1&so=-1&timestamp=1480082277128/"
16+
"https://services-eu1.arcgis.com/3jNCHSftk0N4t7dd/arcgis/rest/services/"
17+
"Schulenstandorte_EPSG25832_2024_25_Sicht/FeatureServer/44/query?"
18+
"where=1%3D1&outFields=*&f=json"
1319
]
1420

15-
detail_url = "https://www.bildung-lsa.de/ajax.php?m=getSSDetails&id={}&timestamp=1480082332787"
16-
1721
def parse(self, response):
18-
js_callbacks = response.css("span ::attr(onclick)").extract()
19-
pattern = "getSSResultItemDetail\((\d+)\)"
20-
ids = [re.match(pattern, text).group(1) for text in js_callbacks]
21-
names = response.css("b::text").extract()
22-
for id, name in zip(ids, names):
23-
request = scrapy.Request(
24-
self.detail_url.format(id), callback=self.parse_detail
25-
)
26-
request.meta["id"] = id
27-
request.meta["name"] = name.strip()
28-
yield request
22+
"""Parse ArcGIS FeatureServer JSON response"""
23+
data = response.json()
2924

30-
def parse_detail(self, response):
31-
tables = response.css("table")
25+
# EPSG:25832 (UTM zone 32N) to EPSG:4326 (WGS84) transformer
26+
transformer = Transformer.from_crs("EPSG:25832", "EPSG:4326", always_xy=True)
3227

33-
content = {}
34-
# Only the second and third table contain interesting data
35-
for table in tables[1:3]:
36-
trs = table.css("tr")
37-
for tr in trs:
38-
tds = tr.css("td")
39-
key = tds[0].css("::text").extract_first()[:-2]
40-
value = " ".join(tds[1].css("::text").extract())
28+
for feature in data.get("features", []):
29+
attrs = feature["attributes"]
30+
geom = feature.get("geometry", {})
4131

42-
content[key] = value
43-
content["Name"] = response.meta["name"]
44-
content["ID"] = response.meta["id"]
45-
# The name is included in the "Adresse" field so we remove that
46-
# in order to get only the address
47-
content["Adresse"] = (
48-
content["Adresse"].replace(response.meta["name"], "").strip()
49-
)
50-
yield content
32+
# Transform coordinates from EPSG:25832 to WGS84
33+
latitude = None
34+
longitude = None
35+
if geom and "x" in geom and "y" in geom:
36+
longitude, latitude = transformer.transform(geom["x"], geom["y"])
37+
38+
# Extract school information from ArcGIS attributes
39+
yield {
40+
"name": attrs.get("Name"),
41+
"city": attrs.get("Ort"),
42+
"school_type": attrs.get("Schulform"),
43+
"category": attrs.get("Kategorie"),
44+
"provider": attrs.get("Traeg_Anw"),
45+
"latitude": latitude,
46+
"longitude": longitude,
47+
"object_id": attrs.get("OBJECTID"),
48+
}
5149

5250
@staticmethod
5351
def normalize(item: Item) -> School:
52+
"""Normalize ArcGIS data to School item"""
53+
# Generate ID from OBJECTID
54+
school_id = f"ST-ARC{item.get('object_id', 0):05d}"
55+
5456
return School(
55-
name=item.get("Name"),
56-
id="ST-{}".format(item.get("ID")),
57-
address=re.split(r"\d{5}", item.get("Adresse").strip())[0].strip(),
58-
zip=re.findall(r"\d{5}", item.get("Adresse").strip())[0],
59-
city=re.split(r"\d{5}", item.get("Adresse").strip())[1].strip(),
60-
website=item.get("Homepage"),
61-
email=item.get("E-Mail"),
62-
fax=item.get("Telefax"),
63-
phone=item.get("Telefon"),
57+
name=item.get("name"),
58+
id=school_id,
59+
city=item.get("city"),
60+
school_type=item.get("school_type"),
61+
legal_status=item.get("category"),
62+
provider=item.get("provider"),
63+
latitude=item.get("latitude"),
64+
longitude=item.get("longitude"),
6465
)

0 commit comments

Comments
 (0)