Skip to content

Commit 035dad6

Browse files
authored
Merge pull request #180 from SimonMand/rework_berlin_and_brandenburg_spiders
Reimplement Berlin and Brandenburg spiders
2 parents 5712988 + ba9489f commit 035dad6

File tree

8 files changed

+206
-56
lines changed

8 files changed

+206
-56
lines changed

.github/workflows/test.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,13 @@ jobs:
4040
PGUSER: postgres
4141
- run: |
4242
set -e
43+
echo "🏗 Running alembic migrations..."
4344
uv run alembic upgrade head
45+
46+
echo "🧪 Running test_models.py..."
4447
uv run python test_models.py
48+
49+
echo "🔎 Discovering and running unit tests..."
50+
uv run python -m unittest discover -s test -p "*.py" -v
4551
env:
4652
DATABASE_URL: postgresql://postgres:[email protected]:5432/jedeschule_test

jedeschule/spiders/berlin.py

Lines changed: 6 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,19 @@
1-
import xml.etree.ElementTree as ET
2-
3-
from jedeschule.items import School
41
from scrapy import Item
52

3+
from jedeschule.items import School
64
from jedeschule.spiders.school_spider import SchoolSpider
5+
from jedeschule.wfs_basic_parsers import parse_geojson_features
76

87

98
class BerlinSpider(SchoolSpider):
109
name = "berlin"
1110
start_urls = [
12-
"https://gdi.berlin.de/services/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:4326&typename=fis:schulen"
11+
"https://gdi.berlin.de/services/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:4326"
12+
"&typename=fis:schulen&outputFormat=application/json"
1313
]
1414

15-
def parse(self, response):
16-
tree = ET.fromstring(response.body)
17-
18-
namespaces = {
19-
"gml": "http://www.opengis.net/gml",
20-
"fis": "http://www.berlin.de/broker",
21-
}
22-
for school in tree.find("gml:featureMembers", namespaces).findall(
23-
"{schulen}schulen", namespaces
24-
):
25-
data_elem = {}
26-
for entry in school:
27-
if entry.tag == "{http://www.opengis.net/gml}boundedBy":
28-
continue
29-
if entry.tag == "{schulen}geom":
30-
# This nested entry contains the coordinates that we would like to expand
31-
lon, lat = entry.findtext(
32-
"gml:Point/gml:pos", namespaces=namespaces
33-
).split(" ")
34-
data_elem["lat"] = lat
35-
data_elem["lon"] = lon
36-
continue
37-
# strip the namespace before returning
38-
data_elem[entry.tag.split("}", 1)[1]] = entry.text
39-
yield data_elem
15+
def parse(self, response, **kwargs):
16+
yield from parse_geojson_features(response)
4017

4118
@staticmethod
4219
def normalize(item: Item) -> School:

jedeschule/spiders/brandenburg.py

Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,20 @@
1-
import xml.etree.ElementTree as ET
2-
31
from scrapy import Item
42

53
from jedeschule.items import School
64
from jedeschule.spiders.school_spider import SchoolSpider
5+
from jedeschule.wfs_basic_parsers import parse_geojson_features
76

87

98
class BrandenburgSpider(SchoolSpider):
109
name = "brandenburg"
1110

1211
start_urls = [
13-
"https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte&srsname=epsg:4326"
12+
"https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte"
13+
"&srsname=epsg:4326&outputFormat=application/json"
1414
]
1515

16-
def parse(self, response):
17-
tree = ET.fromstring(response.body)
18-
19-
namespaces = {
20-
"gml": "http://www.opengis.net/gml",
21-
"ms": "http://mapserver.gis.umn.edu/mapserver",
22-
}
23-
for school in tree.findall("gml:featureMember", namespaces):
24-
data_elem = {}
25-
for entry in school[0]:
26-
if entry.tag == "{http://mapserver.gis.umn.edu/mapserver}msGeometry":
27-
# This nested entry contains the coordinates that we would like to expand
28-
lat, lon = entry.findtext(
29-
"gml:Point/gml:pos", namespaces=namespaces
30-
).split(" ")
31-
data_elem["lat"] = lat
32-
data_elem["lon"] = lon
33-
continue
34-
# strip the namespace before returning
35-
data_elem[entry.tag.split("}", 1)[1]] = entry.text
36-
yield data_elem
16+
def parse(self, response, **kwargs):
17+
yield from parse_geojson_features(response)
3718

3819
@staticmethod
3920
def normalize(item: Item) -> School:

jedeschule/wfs_basic_parsers.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import json
2+
3+
from scrapy.http import Response
4+
5+
6+
def parse_geojson_features(response: "Response"):
    """Yield one flat dict per feature of a GeoJSON FeatureCollection.

    Each yielded dict is a copy of the feature's ``properties`` with two
    extra keys, ``lon`` and ``lat``, taken from the first two entries of
    ``geometry.coordinates``.

    Args:
        response: Object whose ``text`` attribute holds the JSON document
            (in the spiders this is the scrapy response of a WFS request).

    Yields:
        dict: The feature's properties plus ``lon`` and ``lat``. Both are
        ``None`` when the feature has no geometry or fewer than two
        coordinate values, so one incomplete record does not abort the
        whole crawl.

    Raises:
        json.JSONDecodeError: If ``response.text`` is not valid JSON.
    """
    geojson = json.loads(response.text)

    # ``or`` (rather than a .get default) also covers keys that are present
    # but JSON null, e.g. "geometry": null.
    for feature in geojson.get("features") or []:
        # Copy so the parsed document itself is never mutated.
        properties = dict(feature.get("properties") or {})
        geometry = feature.get("geometry") or {}
        coords = geometry.get("coordinates") or []

        # NOTE(review): assumes Point geometries, i.e. a flat
        # [lon, lat, ...] list - confirm if a service returns other types.
        if len(coords) >= 2:
            properties["lon"] = coords[0]
            properties["lat"] = coords[1]
        else:
            properties["lon"] = None
            properties["lat"] = None

        yield properties

test/__init__.py

Whitespace-only changes.

test/test_berlin.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import unittest
2+
from scrapy.http import TextResponse
3+
from jedeschule.spiders.berlin import BerlinSpider
4+
5+
6+
class TestBerlinSpider(unittest.TestCase):
    """Exercise BerlinSpider.parse against a canned GeoJSON response."""

    def test_parse(self):
        # FeatureCollection containing exactly one school feature.
        payload = """
        {
          "type": "FeatureCollection",
          "features": [
            {
              "type": "Feature",
              "id": "schulen.01A04",
              "geometry": {
                "type": "Point",
                "coordinates": [13.33391576, 52.52672359]
              },
              "geometry_name": "geom",
              "properties": {
                "bsn": "01A04",
                "schulname": "Berlin-Kolleg",
                "schulart": "Kolleg",
                "traeger": "öffentlich",
                "schultyp": "Andere Schule",
                "bezirk": "Mitte",
                "ortsteil": "Moabit",
                "plz": "10551",
                "strasse": "Turmstraße",
                "hausnr": "75",
                "telefon": "+49 30 901838210",
                "fax": "+49 30 901838222",
                "email": "[email protected]",
                "internet": "https://www.berlin-kolleg.de",
                "schuljahr": "2024/25"
              },
              "bbox": [
                13.33391576,
                52.52672359,
                13.33391576,
                52.52672359
              ]
            }
          ],
          "totalFeatures": 925,
          "numberMatched": 925,
          "numberReturned": 1,
          "timeStamp": "2025-06-13T14:59:35.045Z",
          "crs": {
            "type": "name",
            "properties": {
              "name": "urn:ogc:def:crs:EPSG::4326"
            }
          },
          "bbox": [
            13.33391576,
            52.52672359,
            13.33391576,
            52.52672359
          ]
        }
        """

        fake_response = TextResponse(
            url="http://test_webserver.com",
            body=payload.encode("utf-8"),
            encoding="utf-8",
        )

        parsed = list(BerlinSpider().parse(fake_response))
        self.assertEqual(len(parsed), 1)

        item = parsed[0]

        # Coordinates come from the feature geometry: [lon, lat].
        self.assertAlmostEqual(item["lon"], 13.33391576)
        self.assertAlmostEqual(item["lat"], 52.52672359)

        # Property values must be passed through unchanged.
        expected_properties = {
            "bsn": "01A04",
            "schulname": "Berlin-Kolleg",
            "plz": "10551",
            "strasse": "Turmstraße",
            "hausnr": "75",
            "telefon": "+49 30 901838210",
            "fax": "+49 30 901838222",
            "email": "[email protected]",
            "internet": "https://www.berlin-kolleg.de",
        }
        for key, expected in expected_properties.items():
            self.assertEqual(item[key], expected)
86+
87+
88+
if __name__ == "__main__":
89+
unittest.main()

test/test_brandenburg.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import unittest
2+
from scrapy.http import TextResponse
3+
import json
4+
5+
from jedeschule.spiders.brandenburg import BrandenburgSpider
6+
7+
8+
class TestBrandenburgSpider(unittest.TestCase):
    """Exercise BrandenburgSpider.parse against a canned GeoJSON response."""

    def test_parse(self):
        # Properties of the single school feature in the fixture.
        school_properties = {
            "schul_nr": "100020",
            "schulname": "Grundschule Forst Mitte",
            "strasse_hausnr": "Max-Fritz-Hammer-Straße 15",
            "plz": "03149",
            "ort": "Forst (Lausitz)",
            "telefonnummer": "(03562) 7163",
            "faxnummer": "(03562) 691288",
            "dienst_email": "[email protected]",
            "homepage": "http://www.grundschule-forst-mitte.de",
            "schulamtname": "Staatliches Schulamt Cottbus",
            "kreis": "Spree-Neiße",
            "schulform_kurzbez": "G",
            "schulform": "Grundschule",
            "traeger": "Gemeinde",
            "schultraeger_grp": "o",
            "schueler": "288 (Stand: 2022)",
            "besonderheiten_sl": "(763),(561),(132),(201)",
            "besonderheiten": [
                "Einstiegsphase Startchancen",
                "Schule mit Nutzung Schul-Cloud Brandenburg",
                "verlässliche Halbtagsschule und Hort",
                "FLEX - Optimierung des Schulanfangs",
            ],
            "studienseminar": "2",
            "fremdsprachen": ["Englisch"],
            "fremdsprachen_sl": "(EN)",
            "fremdsprachen_timestmp": "(Schuljahr: 2020/2021)",
        }
        feature = {
            "type": "Feature",
            "properties": school_properties,
            "geometry": {
                "type": "Point",
                "coordinates": [14.651148207215728, 51.74023651973522],
            },
        }
        collection = {
            "type": "FeatureCollection",
            "name": "Schul_Standorte",
            "features": [feature],
        }
        payload = json.dumps(collection)

        fake_response = TextResponse(
            url="http://test_webserver.com",
            body=payload.encode("utf-8"),
            encoding="utf-8",
        )

        parsed = list(BrandenburgSpider().parse(fake_response))

        self.assertEqual(len(parsed), 1)
        item = parsed[0]

        # Coordinates come from the feature geometry: [lon, lat].
        self.assertAlmostEqual(item["lat"], 51.74023651973522)
        self.assertAlmostEqual(item["lon"], 14.651148207215728)

        # Property values must be passed through unchanged.
        expected_properties = {
            "schul_nr": "100020",
            "schulname": "Grundschule Forst Mitte",
            "plz": "03149",
            "ort": "Forst (Lausitz)",
            "dienst_email": "[email protected]",
            "schulform": "Grundschule",
            "traeger": "Gemeinde",
        }
        for key, expected in expected_properties.items():
            self.assertEqual(item[key], expected)
76+
77+
78+
if __name__ == '__main__':
79+
unittest.main()

test_changes.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22

33
set -euxo pipefail
44

5-
if [ $CI ]
6-
then
7-
HEAD_REF=${GITHUB_REF}
5+
git fetch origin main
6+
7+
if [ "${CI:-}" = "true" ]; then
8+
HEAD_REF="${GITHUB_SHA}"
89
else
910
HEAD_REF="HEAD"
1011
fi

0 commit comments

Comments
 (0)