11from scrapy import Item
22from scrapy .linkextractors import LinkExtractor
3+ import re
34
45from scrapy .spiders import CrawlSpider , Rule
56
@@ -11,24 +12,25 @@ class RheinlandPfalzSpider(CrawlSpider, SchoolSpider):
# Spider configuration for the Rheinland-Pfalz school directory.
name = "rheinland-pfalz"
# Note, one could also use the geo portal:
# https://www.geoportal.rlp.de/spatial-objects/350/collections/schulstandorte/items?f=html&limit=4000
start_urls = ["https://bildung.rlp.de/schulen"]
rules = [
    Rule(
        # `allow` is interpreted as a regular expression: escape the
        # literal dots so the pattern matches only the intended host
        # and path, instead of any character in those positions.
        LinkExtractor(allow=r"https://bildung\.rlp\.de/schulen/einzelanzeige.*"),
        callback="parse_school",
        follow=False,
    )
]
2223
2324 # get the information
2425 def parse_school (self , response ):
25- container = response .css ("#wfqbeResults " )
26+ container = response .css (".rlp-schooldatabase-detail " )
2627 item = {"name" : container .css ("h1::text" ).get ()}
2728 for row in container .css ("tr" ):
2829 key , value = row .css ("td" )
2930 value_parts = value .css ("*::text" ).extract ()
31+ cleaned = [part .strip () for part in value_parts ]
3032 item [key .css ("::text" ).extract_first ().replace (":" , "" )] = (
31- value_parts [0 ] if len (value_parts ) == 1 else value_parts
33+ cleaned [0 ] if len (cleaned ) == 1 else cleaned
3234 )
3335 item ["id" ] = item ["Schulnummer" ]
3436
@@ -39,9 +41,8 @@ def parse_school(self, response):
3941 yield item
4042
4143 def normalize (self , item : Item ) -> School :
42- zip , city = item .get ("Anschrift" )[- 1 ].split ("\xa0 " )
43- email_parts = item .get ("E-Mail" )
44- email = email_parts [0 ].replace ("(at)" , "@" ) + email_parts [2 ]
44+ zip , city = item .get ("Anschrift" )[- 1 ].rsplit (" " )
45+ email = item .get ("E-Mail" , "" ).replace ("(at)" , "@" )
4546 return School (
4647 name = item .get ("name" ),
4748 id = "RP-{}" .format (item .get ("id" )),
0 commit comments