Skip to content

Commit bb424fe

Browse files
committed
[RP] Adapt to new website
1 parent d4b8f95 commit bb424fe

File tree

1 file changed

+8
-7
lines changed

1 file changed

+8
-7
lines changed

jedeschule/spiders/rheinland_pfalz.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from scrapy import Item
22
from scrapy.linkextractors import LinkExtractor
3+
import re
34

45
from scrapy.spiders import CrawlSpider, Rule
56

@@ -11,24 +12,25 @@ class RheinlandPfalzSpider(CrawlSpider, SchoolSpider):
1112
name = "rheinland-pfalz"
1213
# Note, one could also use the geo portal:
1314
# https://www.geoportal.rlp.de/spatial-objects/350/collections/schulstandorte/items?f=html&limit=4000
14-
start_urls = ["https://schulen.bildung-rp.de"]
15+
start_urls = ["https://bildung.rlp.de/schulen"]
1516
rules = [
1617
Rule(
17-
LinkExtractor(allow="https://schulen.bildung-rp.de/einzelanzeige.html?.*"),
18+
LinkExtractor(allow="https://bildung.rlp.de/schulen/einzelanzeige.*"),
1819
callback="parse_school",
1920
follow=False,
2021
)
2122
]
2223

2324
# get the information
2425
def parse_school(self, response):
25-
container = response.css("#wfqbeResults")
26+
container = response.css(".rlp-schooldatabase-detail")
2627
item = {"name": container.css("h1::text").get()}
2728
for row in container.css("tr"):
2829
key, value = row.css("td")
2930
value_parts = value.css("*::text").extract()
31+
cleaned = [part.strip() for part in value_parts]
3032
item[key.css("::text").extract_first().replace(":", "")] = (
31-
value_parts[0] if len(value_parts) == 1 else value_parts
33+
cleaned[0] if len(cleaned) == 1 else cleaned
3234
)
3335
item["id"] = item["Schulnummer"]
3436

@@ -39,9 +41,8 @@ def parse_school(self, response):
3941
yield item
4042

4143
def normalize(self, item: Item) -> School:
42-
zip, city = item.get("Anschrift")[-1].split("\xa0")
43-
email_parts = item.get("E-Mail")
44-
email = email_parts[0].replace("(at)", "@") + email_parts[2]
44+
zip, city = item.get("Anschrift")[-1].rsplit(" ")
45+
email = item.get("E-Mail", "").replace("(at)", "@")
4546
return School(
4647
name=item.get("name"),
4748
id="RP-{}".format(item.get("id")),

0 commit comments

Comments (0)