@@ -11,24 +11,25 @@ class RheinlandPfalzSpider(CrawlSpider, SchoolSpider):
1111 name = "rheinland-pfalz"
1212 # Note, one could also use the geo portal:
1313 # https://www.geoportal.rlp.de/spatial-objects/350/collections/schulstandorte/items?f=html&limit=4000
14- start_urls = ["https://schulen. bildung-rp.de " ]
14+ start_urls = ["https://bildung.rlp.de/schulen " ]
1515 rules = [
1616 Rule (
17- LinkExtractor (allow = "https://schulen. bildung-rp. de/einzelanzeige.html? .*" ),
17+ LinkExtractor (allow = "https://bildung.rlp. de/schulen/ einzelanzeige.*" ),
1818 callback = "parse_school" ,
1919 follow = False ,
2020 )
2121 ]
2222
2323 # get the information
2424 def parse_school (self , response ):
25- container = response .css ("#wfqbeResults " )
25+ container = response .css (".rlp-schooldatabase-detail " )
2626 item = {"name" : container .css ("h1::text" ).get ()}
2727 for row in container .css ("tr" ):
2828 key , value = row .css ("td" )
2929 value_parts = value .css ("*::text" ).extract ()
30+ cleaned = [part .strip () for part in value_parts ]
3031 item [key .css ("::text" ).extract_first ().replace (":" , "" )] = (
31- value_parts [0 ] if len (value_parts ) == 1 else value_parts
32+ cleaned [0 ] if len (cleaned ) == 1 else cleaned
3233 )
3334 item ["id" ] = item ["Schulnummer" ]
3435
@@ -39,9 +40,13 @@ def parse_school(self, response):
3940 yield item
4041
4142 def normalize (self , item : Item ) -> School :
42- zip , city = item .get ("Anschrift" )[- 1 ].split ( " \xa0 " )
43+ zip , city = item .get ("Anschrift" )[- 1 ].rsplit ( " " )
4344 email_parts = item .get ("E-Mail" )
44- email = email_parts [0 ].replace ("(at)" , "@" ) + email_parts [2 ]
45+ email = (
46+ email_parts [0 ].replace ("(at)" , "@" ) + email_parts [2 ]
47+ if email_parts is not None
48+ else None
49+ )
4550 return School (
4651 name = item .get ("name" ),
4752 id = "RP-{}" .format (item .get ("id" )),
0 commit comments