Merge pull request #212 from tifa365/feature/thueringen-wfs

k-nut · web-flow · commit a0f6bd30222e · 2025-10-20T21:14:28.000+02:00
Replace Thüringen HTML scraper with WFS scraper for geolocation support
diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ In details, the IDs are sourced as follows:
 |SL| `OBJECTID` from the WFS service                                                                              | `SL-255`                                                                   |❌ no (confirmed with data provider but no alternative available) |
 |SN| Field `id` from the API                                                                                      | `SN-4062`                                                                  |✅ likely|
 |ST| `ID` query param from the details page URL                                                                   | `ST-1001186`                                                               |❓ probably?|
-|TH| `Schulnumer` from school list                                                                                | `TH-10601`                                                                 |✅ likely|
+|TH| `Schulnummer` from the WFS service                                                                           | `TH-10601`                                                                 |✅ likely|
 
 ## Geolocations
 When available, we try to use the geolocations provided by the data publishers.
@@ -57,7 +57,7 @@ When available, we try to use the geolocations provided by the data publishers.
 | SL    | ✅ Yes                | WFS                                          |
 | SN    | ✅ Yes                | API                                          |
 | ST    | ❌ No                 | -                                            |
-| TH    | ❌ No                 | -                                            |
+| TH    | ✅ Yes                | WFS                                          |
 
 ## Installation
 Dependency management is done using [uv](https://docs.astral.sh/uv/). Make sure
diff --git a/jedeschule/spiders/thueringen.py b/jedeschule/spiders/thueringen.py
@@ -1,6 +1,4 @@
-import re
-
-import scrapy
+import xmltodict
 from scrapy import Item
 
 from jedeschule.items import School
@@ -9,101 +7,58 @@
 
 class ThueringenSpider(SchoolSpider):
     name = "thueringen"
-    base_url = "https://www.schulportal-thueringen.de"
-
     start_urls = [
-        "https://www.schulportal-thueringen.de/tip/schulportraet_suche/search.action?tspi=&tspm=&vsid=none&mode=&extended=0&anwf=schulportraet&freitextsuche=&name=&schulnummer=&strasse=&plz=&ort=&schulartDecode=&schulamtDecode=&kzFreierTraeger_cb=1&kzFreierTraeger=2&schultraegerDecode=&sortierungDecode=Schulname&rowsPerPage=999&schulartCode=&schulamtCode=&schultraegerCode=&sortierungCode=10&uniquePortletId=portlet_schulportraet_suche_WAR_tip1109990a_e473_4c62_872b_4ef69bdb6c5d&ajaxId=schulportraet_suche_results"
+        "https://www.geoproxy.geoportal-th.de/geoproxy/services/kommunal/komm_wfs?"
+        "SERVICE=WFS&REQUEST=GetFeature&typeNames=kommunal:komm_schul&"
+        "srsname=EPSG:4326&VERSION=2.0.0"
     ]
 
-    # TODO: parse last_modified
-    def parse(self, response):
-        headers = [
-            header.css("::text").extract_first().strip()
-            for header in response.css("th")
-        ]
-        for tr in response.css(".tispo_row_odd,.tispo_row_normal"):
-            collection = {}
-            tds = tr.css("td")
-            for index, td in enumerate(tds):
-                key = headers[index]
-                value = td.css("::text").extract_first()
-                # The school name is hidden in a link so we check if there
-                # is a link and if yes extract the value from that
-                link_text = td.css("a ::text").extract_first()
-                if link_text:
-                    value = link_text
-                collection[key] = value.strip()
-            # inspect_response(response, self)
-            url = tds[1].css("::attr(href)").extract_first().strip()
-            request = scrapy.Request(self.base_url + url, callback=self.parse_overview)
-            request.meta["collection"] = collection
-            yield request
+    def parse(self, response, **kwargs):
+        data = xmltodict.parse(response.text)
+        members = data.get("wfs:FeatureCollection", {}).get("wfs:member", [])
 
-    def parse_overview(self, response):
-        # inspect_response(response, self)
-        collection = response.meta["collection"]
-        for tr in response.css(".tispo_labelValueView tr"):
-            tds = tr.css("td ::text").extract()
-            # sometimes there is no value for the key
-            if len(tds) >= 2:
-                collection[tds[0][:-1].strip()] = "".join(
-                    [td.strip() for td in tds[1:]]
-                )
-        collection["data_url"] = response.url
-        collection["Leitbild"] = " ".join(
-            response.css(".tispo_htmlUserContent ::text").extract()
-        )
-        yield collection
+        if not isinstance(members, list):
+            members = [members]
+
+        for member in members:
+            school = member.get("kommunal:komm_schul", {})
+
+            data_elem = {}
+
+            # Extract geometry coordinates
+            geom = school.get("kommunal:GEOM", {})
+            point = geom.get("gml:Point", {})
+            pos = point.get("gml:pos", "")
+            if pos:
+                lon, lat = pos.split()
+                data_elem["lat"] = float(lat)
+                data_elem["lon"] = float(lon)
+
+            # Extract all other fields
+            for key, value in school.items():
+                if key not in ("kommunal:GEOM", "@gml:id") and value:
+                    # Remove namespace prefix
+                    clean_key = key.split(":", 1)[-1] if ":" in key else key
+                    data_elem[clean_key] = value
+
+            yield data_elem
 
     @staticmethod
     def normalize(item: Item) -> School:
-        city_parts = item.get("Ort").split()
-        zip, city = city_parts[0], " ".join(city_parts[1:])
         return School(
-            name=item.get("Schulname"),
+            name=item.get("Name"),
             id="TH-{}".format(item.get("Schulnummer")),
-            address=item.get("Straße"),
-            zip=zip,
-            city=city,
-            website=item.get("Internet"),
-            email=ThueringenSpider._deobfuscate_email(item.get("E-Mail")),
+            address=" ".join(
+                filter(None, [item.get("Strasse"), item.get("Hausnummer")])
+            ),
+            zip=item.get("PLZ"),
+            city=item.get("Ort"),
+            website=item.get("Webseite"),
+            email=item.get("EMail"),
             school_type=item.get("Schulart"),
-            provider=item.get("Schulträger"),
-            fax=item.get("Telefax"),
-            phone=item.get("Telefon"),
+            provider=item.get("Traeger"),
+            fax=item.get("Faxnummer"),
+            phone=item.get("Telefonnummer"),
+            latitude=item.get("lat"),
+            longitude=item.get("lon"),
         )
-
-    @staticmethod
-    def _deobfuscate_email(orig):
-        """
-        Reverse-engineered version of the deobfuscation code on the website.
-
-        :param orig: the obfuscated string or the whole function call (`$(function() {...})`),
-            as long as it contains the prefix `#3b` and the suffix `3e#`.
-        :return: the deofuscated string
-        """
-
-        result = ""
-        if orig and re.search(r"#3b[a-z0-9 ]+3e#", orig):
-            orig = re.search(r"#3b[a-z0-9 ]+3e#", orig).group(0)
-            s = (
-                orig.replace(" ", "")
-                .replace("#3b", "")
-                .replace("3e#", "")
-                .replace("o", "")
-            )
-
-            last_value = 0
-            current_value = 0
-            for i, c in enumerate(s):
-                if c.isnumeric():
-                    current_value = int(c)
-                else:
-                    current_value = ord(c) - 97 + 10
-
-                if i % 2 == 1:
-                    t = int(last_value * 23 + current_value) // 2
-                    result += chr(t)
-                last_value = current_value
-
-        return result