From 476d8f06bba9f1d3228811c2479c2ec4cdc1115e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Hu=CC=88hne?= Date: Tue, 21 Jan 2025 20:24:37 +0100 Subject: [PATCH 1/2] [MV] Remove fractional zeroes in ids Apparently some of the values were returned as floats where they should have been ints --- jedeschule/spiders/mecklenburg_vorpommern.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/jedeschule/spiders/mecklenburg_vorpommern.py b/jedeschule/spiders/mecklenburg_vorpommern.py index d8e42c8..313db18 100644 --- a/jedeschule/spiders/mecklenburg_vorpommern.py +++ b/jedeschule/spiders/mecklenburg_vorpommern.py @@ -6,6 +6,13 @@ from jedeschule.spiders.school_spider import SchoolSpider +def as_string(value: str): + try: + return str(int(value)) + except ValueError: + return value + + class MecklenburgVorpommernSpider(SchoolSpider): name = "mecklenburg-vorpommern" # The state provides the data as an Excel file. The current year's @@ -32,10 +39,10 @@ def parse(self, response): def normalize(item: Item) -> School: return School( name=item.get("NAME1"), - id="MV-{}".format(item.get("DIENSTSTELLEN-NUMMER")), + id="MV-{}".format(as_string(item.get("DIENSTSTELLEN-NUMMER"))), address=item.get("STRASSE"), address2="", - zip=item.get("PLZ"), + zip=as_string(item.get("PLZ")).zfill(5), city=item.get("ORT"), website=item.get("INTERNET"), email=item.get("E-MAIL-ADRESSE"), From 106ddb0ac3b6c286db5c83ed0ac86d973037f554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Hu=CC=88hne?= Date: Tue, 21 Jan 2025 20:25:04 +0100 Subject: [PATCH 2/2] [MV] Use Excel file from 2023/2024 --- jedeschule/spiders/mecklenburg_vorpommern.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jedeschule/spiders/mecklenburg_vorpommern.py b/jedeschule/spiders/mecklenburg_vorpommern.py index 313db18..8baa626 100644 --- a/jedeschule/spiders/mecklenburg_vorpommern.py +++ b/jedeschule/spiders/mecklenburg_vorpommern.py @@ -22,7 +22,7 @@ class 
MecklenburgVorpommernSpider(SchoolSpider): # https://www.statistischebibliothek.de/mir/receive/MVSerie_mods_00000396 # Official documentation on all available data here: # https://www.laiv-mv.de/Statistik/Veröffentlichungen/Verzeichnisse/ - base_url = "https://www.statistischebibliothek.de/mir/servlets/MCRFileNodeServlet/MVHeft_derivate_00006849/V034%202022%2000.xlsx" + base_url = "https://www.statistischebibliothek.de/mir/servlets/MCRFileNodeServlet/MVHeft_derivate_00007470/V044%202023%2000.xlsx" start_urls = [base_url] def parse(self, response):