diff --git a/alembic/versions/b3913e0b45ac_add_location_to_school.py b/alembic/versions/b3913e0b45ac_add_location_to_school.py
index 1d4e363..e5a7a9f 100644
--- a/alembic/versions/b3913e0b45ac_add_location_to_school.py
+++ b/alembic/versions/b3913e0b45ac_add_location_to_school.py
@@ -10,6 +10,7 @@
from alembic import op
import sqlalchemy as sa
+from sqlalchemy import text
# revision identifiers, used by Alembic.
@@ -21,7 +22,7 @@
def upgrade():
conn = op.get_bind()
- conn.execute("CREATE EXTENSION IF NOT EXISTS postgis;")
+ conn.execute(text("CREATE EXTENSION IF NOT EXISTS postgis;"))
op.add_column(
"schools",
sa.Column(
@@ -37,4 +38,4 @@ def upgrade():
def downgrade():
op.drop_column("schools", "location")
conn = op.get_bind()
- conn.execute("DROP EXTENSION IF EXISTS postgis;")
+ conn.execute(text("DROP EXTENSION IF EXISTS postgis;"))
diff --git a/jedeschule/spiders/berlin.py b/jedeschule/spiders/berlin.py
index 055cc60..c39950e 100644
--- a/jedeschule/spiders/berlin.py
+++ b/jedeschule/spiders/berlin.py
@@ -1,42 +1,19 @@
-import xml.etree.ElementTree as ET
-
-from jedeschule.items import School
from scrapy import Item
+from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider
+from jedeschule.utils.wfs_basic_parsers import parse_geojson_features
class BerlinSpider(SchoolSpider):
name = "berlin"
start_urls = [
- "https://gdi.berlin.de/services/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:4326&typename=fis:schulen"
+ "https://gdi.berlin.de/services/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:4326&"
+ "typename=fis:schulen&outputFormat=application/json"
]
- def parse(self, response):
- tree = ET.fromstring(response.body)
-
- namespaces = {
- "gml": "http://www.opengis.net/gml",
- "fis": "http://www.berlin.de/broker",
- }
- for school in tree.find("gml:featureMembers", namespaces).findall(
- "{schulen}schulen", namespaces
- ):
- data_elem = {}
- for entry in school:
- if entry.tag == "{http://www.opengis.net/gml}boundedBy":
- continue
- if entry.tag == "{schulen}geom":
- # This nested entry contains the coordinates that we would like to expand
- lon, lat = entry.findtext(
- "gml:Point/gml:pos", namespaces=namespaces
- ).split(" ")
- data_elem["lat"] = lat
- data_elem["lon"] = lon
- continue
- # strip the namespace before returning
- data_elem[entry.tag.split("}", 1)[1]] = entry.text
- yield data_elem
+ def parse(self, response, **kwargs):
+ yield from parse_geojson_features(response)
@staticmethod
def normalize(item: Item) -> School:
diff --git a/jedeschule/spiders/brandenburg.py b/jedeschule/spiders/brandenburg.py
index 1e1c67c..ea95da1 100644
--- a/jedeschule/spiders/brandenburg.py
+++ b/jedeschule/spiders/brandenburg.py
@@ -1,39 +1,20 @@
-import xml.etree.ElementTree as ET
-
from scrapy import Item
from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider
+from jedeschule.utils.wfs_basic_parsers import parse_geojson_features
class BrandenburgSpider(SchoolSpider):
name = "brandenburg"
start_urls = [
- "https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte&srsname=epsg:4326"
+ "https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte"
+ "&srsname=epsg:4326&outputFormat=application/json"
]
- def parse(self, response):
- tree = ET.fromstring(response.body)
-
- namespaces = {
- "gml": "http://www.opengis.net/gml",
- "ms": "http://mapserver.gis.umn.edu/mapserver",
- }
- for school in tree.findall("gml:featureMember", namespaces):
- data_elem = {}
- for entry in school[0]:
- if entry.tag == "{http://mapserver.gis.umn.edu/mapserver}msGeometry":
- # This nested entry contains the coordinates that we would like to expand
- lat, lon = entry.findtext(
- "gml:Point/gml:pos", namespaces=namespaces
- ).split(" ")
- data_elem["lat"] = lat
- data_elem["lon"] = lon
- continue
- # strip the namespace before returning
- data_elem[entry.tag.split("}", 1)[1]] = entry.text
- yield data_elem
+ def parse(self, response, **kwargs):
+ yield from parse_geojson_features(response)
@staticmethod
def normalize(item: Item) -> School:
diff --git a/jedeschule/spiders/hamburg.py b/jedeschule/spiders/hamburg.py
index d187655..0511dbe 100644
--- a/jedeschule/spiders/hamburg.py
+++ b/jedeschule/spiders/hamburg.py
@@ -1,40 +1,44 @@
-import xml.etree.ElementTree as ET
-
+import xmltodict
from scrapy import Item
-from jedeschule.spiders.school_spider import SchoolSpider
from jedeschule.items import School
+from jedeschule.spiders.school_spider import SchoolSpider
class HamburgSpider(SchoolSpider):
name = "hamburg"
start_urls = [
- "https://geodienste.hamburg.de/HH_WFS_Schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=de.hh.up:nicht_staatliche_schulen,de.hh.up:staatliche_schulen&srsname=EPSG:4326"
+ "https://geodienste.hamburg.de/HH_WFS_Schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature"
+ "&typename=de.hh.up:nicht_staatliche_schulen,de.hh.up:staatliche_schulen&srsname=EPSG:4326"
]
+ def parse(self, response, **kwargs):
+ data = xmltodict.parse(response.body)
+
+ feature_collection = data.get("wfs:FeatureCollection", {})
+ members = feature_collection.get("gml:featureMember", [])
+
+ if not isinstance(members, list):
+ members = [members]
+
+ for member in members:
+ school_data = (member.get("de.hh.up:staatliche_schulen") or
+ member.get("de.hh.up:nicht_staatliche_schulen"))
+ if not school_data:
+ continue
+
+ result = {}
+ for key, value in school_data.items():
+ if key == "de.hh.up:the_geom":
+ coords = value["gml:Point"]["gml:pos"]
+ lon, lat = map(float, coords.split())
+ result["lat"] = lat
+ result["lon"] = lon
+ else:
+ result[key.split(":")[-1]] = value
- def parse(self, response):
- namespaces = {
- "gml": "http://www.opengis.net/gml",
- }
-
- elem = ET.fromstring(response.body)
-
- for member in elem:
- data_elem = {}
- for attr in member[0]:
- if attr.tag == "{https://registry.gdi-de.org/id/de.hh.up}the_geom":
- # This nested entry contains the coordinates that we would like to expand
- lon, lat = attr.findtext(
- "gml:Point/gml:pos", namespaces=namespaces
- ).split(" ")
- data_elem["lat"] = lat
- data_elem["lon"] = lon
- continue
- # strip the namespace before returning
- data_elem[attr.tag.split("}", 1)[1]] = attr.text
- yield data_elem
+ yield result
@staticmethod
def normalize(item: Item) -> School:
diff --git a/jedeschule/spiders/nordrhein_westfalen_helper.py b/jedeschule/spiders/nordrhein_westfalen_helper.py
index 4c9bcef..7a5d3dd 100644
--- a/jedeschule/spiders/nordrhein_westfalen_helper.py
+++ b/jedeschule/spiders/nordrhein_westfalen_helper.py
@@ -2,7 +2,7 @@
import requests
-from jedeschule.utils import singleton
+from jedeschule.utils.utils import singleton
@singleton
diff --git a/jedeschule/spiders/saarland.py b/jedeschule/spiders/saarland.py
index b3ace2d..63e5eff 100644
--- a/jedeschule/spiders/saarland.py
+++ b/jedeschule/spiders/saarland.py
@@ -1,5 +1,5 @@
+import xmltodict
from scrapy import Item
-import xml.etree.ElementTree as ET
from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider
@@ -8,48 +8,49 @@
class SaarlandSpider(SchoolSpider):
name = "saarland"
start_urls = [
- "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
+ "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?"
+ "SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
]
- def parse(self, response):
- tree = ET.fromstring(response.body)
+ def parse(self, response, **kwargs):
+ data = xmltodict.parse(response.text)
- namespaces = {
- "gml": "http://www.opengis.net/gml/3.2",
- "SD": "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer",
- }
+ members = data.get("wfs:FeatureCollection", {}).get("wfs:member", [])
+ if not isinstance(members, list):
+ members = [members]
- for school in tree.iter(
- "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Schulen_SL"
- ):
+ for member in members:
+ school = member.get("Staatliche_Dienste:Schulen_SL", {})
data_elem = {}
- for entry in school:
- if (
- entry.tag
- == "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Shape"
- ):
- # This nested entry contains the coordinates that we would like to expand
- lat, lon = entry.findtext(
- "gml:Point/gml:pos", namespaces=namespaces
- ).split(" ")
- data_elem["lat"] = lat
- data_elem["lon"] = lon
- continue
- # strip the namespace before returning
- data_elem[entry.tag.split("}", 1)[1]] = entry.text
+
+ for key, value in school.items():
+ if key == "Staatliche_Dienste:Shape":
+ pos = (
+ value.get("gml:Point", {})
+ .get("gml:pos", "")
+ .strip()
+ )
+ if pos:
+ lat, lon = pos.split()
+ data_elem["lat"] = lat
+ data_elem["lon"] = lon
+ else:
+ clean_key = key.split(":")[-1]
+ data_elem[clean_key] = value
+
yield data_elem
@staticmethod
def normalize(item: Item) -> School:
# The data also contains a field called `SCHULKENNZ` which implies that it might be an id
# that could be used, but some schools share ids (especially `0` or `000000`) which makes for collisions
- id = item.get("OBJECTID")
+ object_id = item.get("OBJECTID")
return School(
name=item.get("SCHULNAME"),
- address=" ".join([item.get(part) for part in ["HNR", "STR_NAME"]]),
+ address=" ".join(filter(None, [item.get(part) for part in ["HNR", "STR_NAME"]])),
city=item.get("ORT_NAME"),
zip=item.get("PLZ"),
school_type=item.get("SCHULFORM"),
- id=f"SL-{id}",
+ id=f"SL-{object_id}",
)
diff --git a/jedeschule/spiders/sachsen_helper.py b/jedeschule/spiders/sachsen_helper.py
index 0a6206f..7014ddf 100644
--- a/jedeschule/spiders/sachsen_helper.py
+++ b/jedeschule/spiders/sachsen_helper.py
@@ -1,6 +1,6 @@
import requests
-from jedeschule.utils import singleton
+from jedeschule.utils.utils import singleton
@singleton
diff --git a/jedeschule/utils/__init__.py b/jedeschule/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/jedeschule/utils.py b/jedeschule/utils/utils.py
similarity index 100%
rename from jedeschule/utils.py
rename to jedeschule/utils/utils.py
diff --git a/jedeschule/utils/wfs_basic_parsers.py b/jedeschule/utils/wfs_basic_parsers.py
new file mode 100644
index 0000000..9c7e1ca
--- /dev/null
+++ b/jedeschule/utils/wfs_basic_parsers.py
@@ -0,0 +1,20 @@
+import json
+import logging
+
+from scrapy.http import Response
+
+
+def parse_geojson_features(response: Response):
+    """Yield the `properties` dict of each feature in a GeoJSON response,
+    augmented with `lon`/`lat` taken from the feature's Point geometry.
+    Features without a usable coordinate pair are skipped with a warning."""
+    geojson = json.loads(response.text)
+
+    for feature in geojson.get("features", []):
+        properties = feature.get("properties", {})
+        coords = feature.get("geometry", {}).get("coordinates", [])
+
+        try:
+            properties["lon"] = coords[0]
+            properties["lat"] = coords[1]
+        except (TypeError, IndexError):
+            # Without `continue` the half-populated item (no lat/lon) would
+            # still be yielded, contradicting this log message.
+            logging.warning("Skipping feature with invalid geometry")
+            continue
+        yield properties
diff --git a/pyproject.toml b/pyproject.toml
index a8978a4..bb0739c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,4 +13,5 @@ dependencies = [
"requests==2.32.3",
"scrapy==2.13.1",
"sqlalchemy==1.3.10",
+ "xmltodict==0.13.0",
]
diff --git a/test/test_berlin.py b/test/test_berlin.py
new file mode 100644
index 0000000..c17fe13
--- /dev/null
+++ b/test/test_berlin.py
@@ -0,0 +1,90 @@
+import unittest
+from scrapy.http import TextResponse
+from jedeschule.spiders.berlin import BerlinSpider
+
+
+class TestBerlinSpider(unittest.TestCase):
+ def test_parse(self):
+ json_response = """
+ {
+ "type": "FeatureCollection",
+ "features": [
+ {
+ "type": "Feature",
+ "id": "schulen.01A04",
+ "geometry": {
+ "type": "Point",
+ "coordinates": [13.33391576, 52.52672359]
+ },
+ "geometry_name": "geom",
+ "properties": {
+ "bsn": "01A04",
+ "schulname": "Berlin-Kolleg",
+ "schulart": "Kolleg",
+ "traeger": "öffentlich",
+ "schultyp": "Andere Schule",
+ "bezirk": "Mitte",
+ "ortsteil": "Moabit",
+ "plz": "10551",
+ "strasse": "Turmstraße",
+ "hausnr": "75",
+ "telefon": "+49 30 901838210",
+ "fax": "+49 30 901838222",
+ "email": "sekretariat@berlin-kolleg.de",
+ "internet": "https://www.berlin-kolleg.de",
+ "schuljahr": "2024/25"
+ },
+ "bbox": [
+ 13.33391576,
+ 52.52672359,
+ 13.33391576,
+ 52.52672359
+ ]
+ }
+ ],
+ "totalFeatures": 925,
+ "numberMatched": 925,
+ "numberReturned": 1,
+ "timeStamp": "2025-06-13T14:59:35.045Z",
+ "crs": {
+ "type": "name",
+ "properties": {
+ "name": "urn:ogc:def:crs:EPSG::4326"
+ }
+ },
+ "bbox": [
+ 13.33391576,
+ 52.52672359,
+ 13.33391576,
+ 52.52672359
+ ]
+ }
+ """
+
+ spider = BerlinSpider()
+ response = TextResponse(
+ url="http://example.com",
+ body=json_response.encode("utf-8"),
+ encoding="utf-8",
+ )
+
+
+ schools = list(spider.parse(response))
+ self.assertEqual(len(schools), 1)
+
+ school = schools[0]
+ self.assertAlmostEqual(school["lon"], 13.33391576)
+ self.assertAlmostEqual(school["lat"], 52.52672359)
+ self.assertEqual(school["bsn"], "01A04")
+ self.assertEqual(school["schulname"], "Berlin-Kolleg")
+ self.assertEqual(school["plz"], "10551")
+ self.assertEqual(school["strasse"], "Turmstraße")
+ self.assertEqual(school["hausnr"], "75")
+ self.assertEqual(school["telefon"], "+49 30 901838210")
+ self.assertEqual(school["fax"], "+49 30 901838222")
+ self.assertEqual(school["email"], "sekretariat@berlin-kolleg.de")
+ self.assertEqual(school["internet"], "https://www.berlin-kolleg.de")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/test/test_brandenburg.py b/test/test_brandenburg.py
new file mode 100644
index 0000000..dc91713
--- /dev/null
+++ b/test/test_brandenburg.py
@@ -0,0 +1,75 @@
+import unittest
+from types import SimpleNamespace
+import json
+
+from jedeschule.spiders.brandenburg import BrandenburgSpider
+
+
+class TestBrandenburgSpider(unittest.TestCase):
+
+ def test_parse(self):
+ json_text = json.dumps({
+ "type": "FeatureCollection",
+ "name": "Schul_Standorte",
+ "features": [
+ {
+ "type": "Feature",
+ "properties": {
+ "schul_nr": "100020",
+ "schulname": "Grundschule Forst Mitte",
+ "strasse_hausnr": "Max-Fritz-Hammer-Straße 15",
+ "plz": "03149",
+ "ort": "Forst (Lausitz)",
+ "telefonnummer": "(03562) 7163",
+ "faxnummer": "(03562) 691288",
+ "dienst_email": "s100020@schulen.brandenburg.de",
+ "homepage": "http://www.grundschule-forst-mitte.de",
+ "schulamtname": "Staatliches Schulamt Cottbus",
+ "kreis": "Spree-Neiße",
+ "schulform_kurzbez": "G",
+ "schulform": "Grundschule",
+ "traeger": "Gemeinde",
+ "schultraeger_grp": "o",
+ "schueler": "288 (Stand: 2022)",
+ "besonderheiten_sl": "(763),(561),(132),(201)",
+ "besonderheiten": [
+ "Einstiegsphase Startchancen",
+ "Schule mit Nutzung Schul-Cloud Brandenburg",
+ "verlässliche Halbtagsschule und Hort",
+ "FLEX - Optimierung des Schulanfangs"
+ ],
+ "studienseminar": "2",
+ "fremdsprachen": ["Englisch"],
+ "fremdsprachen_sl": "(EN)",
+ "fremdsprachen_timestmp": "(Schuljahr: 2020/2021)"
+ },
+ "geometry": {
+ "type": "Point",
+ "coordinates": [14.651148207215728, 51.74023651973522]
+ }
+ }
+ ]
+ })
+
+ spider = BrandenburgSpider()
+ response = SimpleNamespace(text=json_text)
+
+ results = list(spider.parse(response))
+
+ self.assertEqual(len(results), 1)
+ school = results[0]
+
+ self.assertAlmostEqual(school["lat"], 51.74023651973522)
+ self.assertAlmostEqual(school["lon"], 14.651148207215728)
+
+ self.assertEqual(school["schul_nr"], "100020")
+ self.assertEqual(school["schulname"], "Grundschule Forst Mitte")
+ self.assertEqual(school["plz"], "03149")
+ self.assertEqual(school["ort"], "Forst (Lausitz)")
+ self.assertEqual(school["dienst_email"], "s100020@schulen.brandenburg.de")
+ self.assertEqual(school["schulform"], "Grundschule")
+ self.assertEqual(school["traeger"], "Gemeinde")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/test/test_hamburg.py b/test/test_hamburg.py
new file mode 100644
index 0000000..5f50e8c
--- /dev/null
+++ b/test/test_hamburg.py
@@ -0,0 +1,78 @@
+import unittest
+
+from scrapy.http import TextResponse, Request
+
+from jedeschule.spiders.hamburg import HamburgSpider
+
+
+class TestHamburgSpider(unittest.TestCase):
+
+ def test_parse(self):
+ xml_body = """
+
+
+
+ 22159 Hamburg
+ Rahlstedter Weg 15
+ 417
+ 417 an 1 Standort
+ Wandsbek
+ +49 40 53 30 43-29
+ true
+ Grundschulen
+ No
+ http://rebbz-wandsbek-sued.hamburg.de/
+ staatlich anerkannte Ersatzschule
+ 2024
+ sekretariat@kath-schule-farmsen.kseh.de
+ http://www.ksfhh.de
+ 3213-0
+ +49 40 53 30 43-10
+ Berend Loges
+ Grundschule|Vorschulklasse
+ Katholische Schule Farmsen
+ Hauptstandort
+ Stufe 4
+ Farmsen-Berne
+ 451
+ ReBBZ Wandsbek-Süd
+
+
+
+ 10.121824 53.606715
+
+
+
+
+
+ """
+
+ spider = HamburgSpider()
+ response = TextResponse(
+ url="https://test.com",
+ request=Request(url="https://test.com"),
+ body=xml_body,
+ encoding="utf-8"
+ )
+
+ schools = list(spider.parse(response))
+ self.assertEqual(len(schools), 1)
+
+ school_in_hamburg = schools[0]
+ self.assertEqual(school_in_hamburg["schulname"], "Katholische Schule Farmsen")
+ self.assertEqual(school_in_hamburg["schul_id"], "3213-0")
+ self.assertEqual(school_in_hamburg["adresse_strasse_hausnr"], "Rahlstedter Weg 15")
+ self.assertEqual(school_in_hamburg["adresse_ort"], "22159 Hamburg")
+ self.assertEqual(school_in_hamburg["schul_homepage"], "http://www.ksfhh.de")
+ self.assertEqual(school_in_hamburg["schul_email"], "sekretariat@kath-schule-farmsen.kseh.de")
+ self.assertEqual(school_in_hamburg["schulform"], "Grundschule|Vorschulklasse")
+ self.assertEqual(school_in_hamburg["fax"], "+49 40 53 30 43-29")
+ self.assertEqual(school_in_hamburg["schul_telefonnr"], "+49 40 53 30 43-10")
+ self.assertEqual(school_in_hamburg["lat"], 53.606715)
+ self.assertEqual(school_in_hamburg["lon"], 10.121824)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_saarland.py b/test/test_saarland.py
new file mode 100644
index 0000000..13ce12b
--- /dev/null
+++ b/test/test_saarland.py
@@ -0,0 +1,72 @@
+import unittest
+
+from scrapy.http import TextResponse
+
+from jedeschule.spiders.saarland import SaarlandSpider
+
+
+class TestSaarlandSpider(unittest.TestCase):
+
+ def test_parse(self):
+ xml = """
+
+
+
+ 1
+
+
+ 49.24067452 7.02085050
+
+
+ 41
+ 66123
+ Saarbrücken
+ St.Johann
+ Kohlweg
+ 7
+
+ Hochschule
+ Deutsch-Französiche Hochschule, Universite´ franco-allemande
+
+
+
+ Regionalverband Saarbrücken
+ 0
+ Hochschule
+ 2574380.85600000
+ 5456457.44000000
+ Geodatenzentrum, Stand: 29.11.2022
+
+
+
+ """
+
+ spider = SaarlandSpider()
+ response = TextResponse(url="https://test.com", body=xml, encoding="utf-8")
+
+ schools = list(spider.parse(response))
+
+ self.assertEqual(len(schools), 1)
+
+ school = schools[0]
+
+ self.assertEqual(school["lat"], "49.24067452")
+ self.assertEqual(school["lon"], "7.02085050")
+
+ self.assertEqual(school["OBJECTID"], "1")
+ self.assertEqual(school["KREIS"], "41")
+ self.assertEqual(school["PLZ"], "66123")
+ self.assertEqual(school["ORT_NAME"], "Saarbrücken")
+ self.assertEqual(school["POST_ORT"], "St.Johann")
+ self.assertEqual(school["STR_NAME"], "Kohlweg")
+ self.assertEqual(school["HNR"], "7")
+ self.assertEqual(school["SCHULFORM"], "Hochschule")
+ self.assertEqual(school["SCHULNAME"],
+ "Deutsch-Französiche Hochschule, Universite´ franco-allemande")
+ self.assertEqual(school["SCHULREGIO"], "Regionalverband Saarbrücken")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/test_changes.sh b/test_changes.sh
index 6d0f280..398f992 100755
--- a/test_changes.sh
+++ b/test_changes.sh
@@ -11,13 +11,14 @@ fi
echo "Using head reference: ${HEAD_REF}"
-CHANGED_SCRAPERS=$(git whatchanged --name-only --pretty="" origin/main..${HEAD_REF} |
+CHANGED_SCRAPERS=$(git diff --name-only origin/main...${HEAD_REF} |
grep spiders |
grep -v helper |
- sed 's/jedeschule\/spiders\///' |
- sed 's/\.py//' |
- sed 's/_/\-/' |
- uniq) || true
+ sed 's|jedeschule/spiders/||' |
+ sed 's|\.py||' |
+ sed 's|_|-|' |
+ sort -u) || true
+
if [ -z "$CHANGED_SCRAPERS" ]; then
echo "No scrapers were changed"
diff --git a/uv.lock b/uv.lock
index 8a20f59..20e644d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -270,6 +270,7 @@ dependencies = [
{ name = "requests" },
{ name = "scrapy" },
{ name = "sqlalchemy" },
+ { name = "xmltodict" },
]
[package.metadata]
@@ -282,6 +283,7 @@ requires-dist = [
{ name = "requests", specifier = "==2.32.3" },
{ name = "scrapy", specifier = "==2.13.1" },
{ name = "sqlalchemy", specifier = "==1.3.10" },
+ { name = "xmltodict", specifier = "==0.13.0" },
]
[[package]]
@@ -702,6 +704,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/58/dd/56f0d8af71e475ed194d702f8b4cf9cea812c95e82ad823d239023c6558c/w3lib-2.3.1-py3-none-any.whl", hash = "sha256:9ccd2ae10c8c41c7279cd8ad4fe65f834be894fe7bfdd7304b991fd69325847b", size = 21751, upload-time = "2025-01-27T14:22:09.421Z" },
]
+[[package]]
+name = "xmltodict"
+version = "0.13.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/39/0d/40df5be1e684bbaecdb9d1e0e40d5d482465de6b00cbb92b84ee5d243c7f/xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56", size = 33813, upload-time = "2022-05-08T07:00:04.916Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/94/db/fd0326e331726f07ff7f40675cd86aa804bfd2e5016c727fa761c934990e/xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852", size = 9971, upload-time = "2022-05-08T07:00:02.898Z" },
+]
+
[[package]]
name = "zope-interface"
version = "7.2"