5 changes: 3 additions & 2 deletions alembic/versions/b3913e0b45ac_add_location_to_school.py
@@ -10,6 +10,7 @@

from alembic import op
import sqlalchemy as sa
from sqlalchemy import text


# revision identifiers, used by Alembic.
@@ -21,7 +22,7 @@

def upgrade():
conn = op.get_bind()
conn.execute("CREATE EXTENSION IF NOT EXISTS postgis;")
conn.execute(text("CREATE EXTENSION IF NOT EXISTS postgis;"))
op.add_column(
"schools",
sa.Column(
@@ -37,4 +38,4 @@ def upgrade():
def downgrade():
op.drop_column("schools", "location")
conn = op.get_bind()
conn.execute("DROP EXTENSION IF EXISTS postgis;")
conn.execute(text("DROP EXTENSION IF EXISTS postgis;"))
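
Background on the `text()` wrapper: SQLAlchemy 1.4 deprecated and 2.0 removed executing bare SQL strings through `Connection.execute()`, so wrapping the statement keeps the migration working if the pinned SQLAlchemy version is ever bumped. A minimal sketch of the difference, assuming a hypothetical Postgres DSN:

```python
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://localhost/jedeschule")  # hypothetical DSN

with engine.connect() as conn:
    # conn.execute("SELECT 1")        # accepted in SQLAlchemy 1.3, removed in 2.0
    conn.execute(text("SELECT 1"))    # works across versions
```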
35 changes: 6 additions & 29 deletions jedeschule/spiders/berlin.py
@@ -1,42 +1,19 @@
import xml.etree.ElementTree as ET

from jedeschule.items import School
from scrapy import Item

from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider
from jedeschule.utils.wfs_basic_parsers import parse_geojson_features


class BerlinSpider(SchoolSpider):
name = "berlin"
start_urls = [
"https://gdi.berlin.de/services/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:4326&typename=fis:schulen"
"https://gdi.berlin.de/services/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:4326&"
"typename=fis:schulen&outputFormat=application/json"
]

def parse(self, response):
tree = ET.fromstring(response.body)

namespaces = {
"gml": "http://www.opengis.net/gml",
"fis": "http://www.berlin.de/broker",
}
for school in tree.find("gml:featureMembers", namespaces).findall(
"{schulen}schulen", namespaces
):
data_elem = {}
for entry in school:
if entry.tag == "{http://www.opengis.net/gml}boundedBy":
continue
if entry.tag == "{schulen}geom":
# This nested entry contains the coordinates that we would like to expand
lon, lat = entry.findtext(
"gml:Point/gml:pos", namespaces=namespaces
).split(" ")
data_elem["lat"] = lat
data_elem["lon"] = lon
continue
# strip the namespace before returning
data_elem[entry.tag.split("}", 1)[1]] = entry.text
yield data_elem
def parse(self, response, **kwargs):
yield from parse_geojson_features(response)

@staticmethod
def normalize(item: Item) -> School:
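
The Berlin spider (and Brandenburg below) now asks the WFS for GeoJSON via `outputFormat=application/json` and hands the response to the shared `parse_geojson_features` helper instead of walking GML by hand. Roughly, such a response has the following shape (illustrative field names and values, not actual Berlin data):

```python
feature_collection = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": [13.405, 52.52]},  # lon, lat
            "properties": {"name": "Example School", "plz": "10117"},       # hypothetical fields
        }
    ],
}

# parse_geojson_features() yields each feature's properties dict with
# "lon"/"lat" copied out of the point coordinates:
# {"name": "Example School", "plz": "10117", "lon": 13.405, "lat": 52.52}
```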
29 changes: 5 additions & 24 deletions jedeschule/spiders/brandenburg.py
@@ -1,39 +1,20 @@
import xml.etree.ElementTree as ET

from scrapy import Item

from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider
from jedeschule.utils.wfs_basic_parsers import parse_geojson_features


class BrandenburgSpider(SchoolSpider):
name = "brandenburg"

start_urls = [
"https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte&srsname=epsg:4326"
"https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte"
"&srsname=epsg:4326&outputFormat=application/json"
]

def parse(self, response):
tree = ET.fromstring(response.body)

namespaces = {
"gml": "http://www.opengis.net/gml",
"ms": "http://mapserver.gis.umn.edu/mapserver",
}
for school in tree.findall("gml:featureMember", namespaces):
data_elem = {}
for entry in school[0]:
if entry.tag == "{http://mapserver.gis.umn.edu/mapserver}msGeometry":
# This nested entry contains the coordinates that we would like to expand
lat, lon = entry.findtext(
"gml:Point/gml:pos", namespaces=namespaces
).split(" ")
data_elem["lat"] = lat
data_elem["lon"] = lon
continue
# strip the namespace before returning
data_elem[entry.tag.split("}", 1)[1]] = entry.text
yield data_elem
def parse(self, response, **kwargs):
yield from parse_geojson_features(response)

@staticmethod
def normalize(item: Item) -> School:
54 changes: 29 additions & 25 deletions jedeschule/spiders/hamburg.py
@@ -1,40 +1,44 @@
import xml.etree.ElementTree as ET

import xmltodict
from scrapy import Item

from jedeschule.spiders.school_spider import SchoolSpider
from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider


class HamburgSpider(SchoolSpider):
name = "hamburg"

start_urls = [
"https://geodienste.hamburg.de/HH_WFS_Schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=de.hh.up:nicht_staatliche_schulen,de.hh.up:staatliche_schulen&srsname=EPSG:4326"
"https://geodienste.hamburg.de/HH_WFS_Schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature"
"&typename=de.hh.up:nicht_staatliche_schulen,de.hh.up:staatliche_schulen&srsname=EPSG:4326"
]

def parse(self, response, **kwargs):
data = xmltodict.parse(response.body)

feature_collection = data.get("wfs:FeatureCollection", {})
members = feature_collection.get("gml:featureMember", [])

if not isinstance(members, list):
members = [members]

for member in members:
school_data = (member.get("de.hh.up:staatliche_schulen") or
member.get("de.hh.up:nicht_staatliche_schulen"))
if not school_data:
continue

result = {}
for key, value in school_data.items():
if key == "de.hh.up:the_geom":
coords = value["gml:Point"]["gml:pos"]
lon, lat = map(float, coords.split())
result["lat"] = lat
result["lon"] = lon
else:
result[key.split(":")[-1]] = value

def parse(self, response):
namespaces = {
"gml": "http://www.opengis.net/gml",
}

elem = ET.fromstring(response.body)

for member in elem:
data_elem = {}
for attr in member[0]:
if attr.tag == "{https://registry.gdi-de.org/id/de.hh.up}the_geom":
# This nested entry contains the coordinates that we would like to expand
lon, lat = attr.findtext(
"gml:Point/gml:pos", namespaces=namespaces
).split(" ")
data_elem["lat"] = lat
data_elem["lon"] = lon
continue
# strip the namespace before returning
data_elem[attr.tag.split("}", 1)[1]] = attr.text
yield data_elem
yield result

@staticmethod
def normalize(item: Item) -> School:
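
The Hamburg spider (and Saarland below) now parses the WFS XML with xmltodict instead of ElementTree. One xmltodict quirk explains the `isinstance(members, list)` check: an element that occurs once is mapped to a single dict, while a repeated element becomes a list of dicts. A minimal sketch of that behavior:

```python
import xmltodict

single = xmltodict.parse("<root><item><a>1</a></item></root>")
many = xmltodict.parse("<root><item><a>1</a></item><item><a>2</a></item></root>")

print(type(single["root"]["item"]).__name__)  # dict-like: the element occurred once
print(type(many["root"]["item"]).__name__)    # list: the element was repeated

members = single["root"]["item"]
if not isinstance(members, list):  # same normalization as in the spider
    members = [members]
```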
2 changes: 1 addition & 1 deletion jedeschule/spiders/nordrhein_westfalen_helper.py
@@ -2,7 +2,7 @@

import requests

from jedeschule.utils import singleton
from jedeschule.utils.utils import singleton


@singleton
57 changes: 29 additions & 28 deletions jedeschule/spiders/saarland.py
@@ -1,5 +1,5 @@
import xmltodict
from scrapy import Item
import xml.etree.ElementTree as ET

from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider
@@ -8,48 +8,49 @@
class SaarlandSpider(SchoolSpider):
name = "saarland"
start_urls = [
"https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
"https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?"
"SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
]

def parse(self, response):
tree = ET.fromstring(response.body)
def parse(self, response, **kwargs):
data = xmltodict.parse(response.text)

namespaces = {
"gml": "http://www.opengis.net/gml/3.2",
"SD": "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer",
}
members = data.get("wfs:FeatureCollection", {}).get("wfs:member", [])
if not isinstance(members, list):
members = [members]

for school in tree.iter(
"{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Schulen_SL"
):
for member in members:
school = member.get("Staatliche_Dienste:Schulen_SL", {})
data_elem = {}
for entry in school:
if (
entry.tag
== "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Shape"
):
# This nested entry contains the coordinates that we would like to expand
lat, lon = entry.findtext(
"gml:Point/gml:pos", namespaces=namespaces
).split(" ")
data_elem["lat"] = lat
data_elem["lon"] = lon
continue
# strip the namespace before returning
data_elem[entry.tag.split("}", 1)[1]] = entry.text

for key, value in school.items():
if key == "Staatliche_Dienste:Shape":
pos = (
value.get("gml:Point", {})
.get("gml:pos", "")
.strip()
)
if pos:
lat, lon = pos.split()
data_elem["lat"] = lat
data_elem["lon"] = lon
else:
clean_key = key.split(":")[-1]
data_elem[clean_key] = value

yield data_elem

@staticmethod
def normalize(item: Item) -> School:
# The data also contains a field called `SCHULKENNZ` which implies that it might be an id
# that could be used, but some schools share ids (especially `0` or `000000`) which makes for collisions
id = item.get("OBJECTID")
object_id = item.get("OBJECTID")

return School(
name=item.get("SCHULNAME"),
address=" ".join([item.get(part) for part in ["HNR", "STR_NAME"]]),
address=" ".join(filter(None, [item.get(part) for part in ["HNR", "STR_NAME"]])),
city=item.get("ORT_NAME"),
zip=item.get("PLZ"),
school_type=item.get("SCHULFORM"),
id=f"SL-{id}",
id=f"SL-{object_id}",
)
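
A small robustness fix rides along in `normalize`: filtering out `None` before joining the address parts avoids a `TypeError` when a record is missing its house number or street name. Illustrative example with a hypothetical record:

```python
item = {"STR_NAME": "Musterstraße"}  # hypothetical record without an HNR field

parts = [item.get(part) for part in ["HNR", "STR_NAME"]]  # [None, "Musterstraße"]
# " ".join(parts)                        # old code: TypeError on the None entry
address = " ".join(filter(None, parts))  # new code: "Musterstraße"
```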
2 changes: 1 addition & 1 deletion jedeschule/spiders/sachsen_helper.py
@@ -1,6 +1,6 @@
import requests

from jedeschule.utils import singleton
from jedeschule.utils.utils import singleton


@singleton
Empty file added jedeschule/utils/__init__.py
Empty file.
File renamed without changes: jedeschule/utils.py → jedeschule/utils/utils.py
20 changes: 20 additions & 0 deletions jedeschule/utils/wfs_basic_parsers.py
@@ -0,0 +1,20 @@
import json
import logging

from scrapy.http import Response


def parse_geojson_features(response: Response):
geojson = json.loads(response.text)

for feature in geojson.get("features", []):
properties = feature.get("properties", {})
coords = feature.get("geometry", {}).get("coordinates", [])

try:
properties["lon"] = coords[0]
properties["lat"] = coords[1]
except (TypeError, IndexError):
logging.warning("Feature has invalid geometry; yielding without coordinates")

yield properties
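
A quick way to exercise the new helper outside a running crawl, using a hand-built `TextResponse` and a hypothetical payload (one valid feature, one without geometry):

```python
import json

from scrapy.http import TextResponse

from jedeschule.utils.wfs_basic_parsers import parse_geojson_features

payload = {
    "features": [
        {"geometry": {"coordinates": [13.4, 52.5]}, "properties": {"name": "A"}},
        {"properties": {"name": "B"}},  # no geometry at all
    ]
}
response = TextResponse(
    url="https://example.org/wfs",  # placeholder URL
    body=json.dumps(payload).encode("utf-8"),
    encoding="utf-8",
)

for props in parse_geojson_features(response):
    print(props)
# {'name': 'A', 'lon': 13.4, 'lat': 52.5}
# {'name': 'B'}   <- invalid geometry: a warning is logged, no lon/lat added
```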
1 change: 1 addition & 0 deletions pyproject.toml
@@ -13,4 +13,5 @@ dependencies = [
"requests==2.32.3",
"scrapy==2.13.1",
"sqlalchemy==1.3.10",
"xmltodict==0.13.0",
]