|
1 | | -import time |
2 | | -import json |
3 | | - |
| 1 | +import re |
4 | 2 | import scrapy |
5 | 3 | from scrapy import Item |
6 | 4 |
|
7 | 5 | from jedeschule.spiders.school_spider import SchoolSpider |
8 | 6 | from jedeschule.items import School |
9 | 7 |
|
10 | 8 |
|
| 9 | +# Pattern to extract DISCH (8-digit school ID) from Baden-Württemberg email addresses |
| 10 | +DISCH_RE = re.compile(r'@(\d{8})\.schule\.bwl\.de', re.IGNORECASE) |
| 11 | + |
| 12 | + |
| 13 | +def extract_disch(email: str | None) -> str | None: |
| 14 | + """ |
| 15 | + Extract 8-digit DISCH (Dienststellenschlüssel) from BW school email address. |
| 16 | +
|
| 17 | + Args: |
| 18 | + email: Email address, typically in format poststelle@{DISCH}.schule.bwl.de |
| 19 | +
|
| 20 | + Returns: |
| 21 | + 8-digit DISCH string if found, None otherwise |
| 22 | +
|
| 23 | + Example: |
| 24 | + >>> extract_disch("[email protected]") |
| 25 | + '04144952' |
| 26 | + >>> extract_disch("[email protected]") |
| 27 | + None |
| 28 | + """ |
| 29 | + if not email: |
| 30 | + return None |
| 31 | + |
| 32 | + match = DISCH_RE.search(email.strip()) |
| 33 | + return match.group(1) if match else None |
| 34 | + |
| 35 | + |
class BadenWuerttembergSpider(SchoolSpider):
    """Spider for schools in Baden-Württemberg.

    Fetches the ``us-govserv:GovernmentalService`` feature type from the
    ``gis.kultus-bw.de`` GeoServer WFS endpoint as JSON in a single request
    and yields one flat dict per feature.
    """

    name = "baden-wuerttemberg"

    start_urls = [
        "https://gis.kultus-bw.de/geoserver/us-govserv/ows?"
        "service=WFS&request=GetFeature&"
        "typeNames=us-govserv%3AGovernmentalService&"
        "outputFormat=application%2Fjson"
    ]

    @staticmethod
    def _dig(node, *keys, default=None):
        """Follow ``keys`` through nested dicts.

        Returns ``default`` as soon as a level is not a dict, a key is
        missing, or the final value is ``None`` (JSON ``null``).
        """
        for key in keys:
            if not isinstance(node, dict):
                return default
            node = node.get(key)
        return default if node is None else node

    @classmethod
    def _section(cls, node, *keys):
        """Like ``_dig`` but always returns a dict (empty when absent or malformed)."""
        found = cls._dig(node, *keys)
        return found if isinstance(found, dict) else {}

    @staticmethod
    def _text(value, strip=True):
        """Return ``value`` if it is a string (optionally stripped), else ``""``."""
        if not isinstance(value, str):
            return ""
        return value.strip() if strip else value

    def parse(self, response):
        """Parse the WFS GeoJSON response and yield one raw item per feature.

        Args:
            response: Response whose body is the JSON feature collection.

        Yields:
            dict with the keys ``uuid``, ``disch``, ``name``, ``address``,
            ``zip``, ``city``, ``email``, ``phone``, ``fax``, ``website``,
            ``school_type``, ``lat`` and ``lon``.

        Features that are not objects, or whose ``properties`` member is
        missing or null, are skipped. Any other missing, null or malformed
        nested member results in an empty or ``None`` field rather than an
        exception, so one bad record does not abort the whole crawl.
        """
        data = response.json()

        # "or []" also covers an explicit `"features": null`.
        for feature in data.get("features") or []:
            if not isinstance(feature, dict):
                continue

            props = feature.get("properties")
            if not isinstance(props, dict):
                # Without properties there is no school data to extract.
                continue

            uuid = feature.get("id")

            # Extract coordinates
            coords = self._dig(
                props, "serviceLocation", "serviceLocationByGeometry", "coordinates"
            )
            has_pair = isinstance(coords, (list, tuple)) and len(coords) >= 2

            # Note: BW WFS returns [latitude, longitude] (non-standard!)
            lat = coords[0] if has_pair else None
            lon = coords[1] if has_pair else None

            # Contact and address containers (always dicts from here on)
            contact = self._section(props, "pointOfContact", "Contact")
            addr_repr = self._section(contact, "address", "AddressRepresentation")

            # School name (kept unstripped, exactly as delivered)
            name = self._text(
                self._dig(addr_repr, "locatorName", "spelling", "text"),
                strip=False,
            )

            # Street and house number
            street = self._text(
                self._dig(
                    addr_repr, "thoroughfare", "GeographicalName", "spelling", "text"
                )
            )
            locator = self._text(addr_repr.get("locatorDesignator"))

            # Full address; a house number without a street is not an address
            address = f"{street} {locator}".strip() if street else None

            # ZIP code
            zip_code = self._text(addr_repr.get("postCode"))

            # City
            city = self._text(
                self._dig(addr_repr, "postName", "GeographicalName", "spelling", "text")
            )

            # Contact info (missing or null becomes "")
            email = self._dig(contact, "electronicMailAddress", default="")
            phone = self._dig(contact, "telephoneVoice", default="")
            fax = self._dig(contact, "telephoneFacsimile", default="")
            website = self._dig(contact, "website", default="")

            # Extract DISCH from email (if available); only strings can hold one
            disch = extract_disch(email) if isinstance(email, str) else None

            # Service type (school type)
            service_type = self._dig(props, "serviceType", "@href", default="")

            yield {
                "uuid": uuid,
                "disch": disch,  # Store in raw for reference
                "name": name,
                "address": address,
                "zip": zip_code,
                "city": city,
                "email": email,
                "phone": phone,
                "fax": fax,
                "website": website if website else None,
                "school_type": service_type,
                "lat": lat,
                "lon": lon,
            }

    @staticmethod
    def normalize(item: Item) -> School:
        """Map a raw item produced by ``parse`` onto a ``School``.

        The ID is ``BW-{disch}`` when a DISCH was extracted, otherwise
        ``BW-UUID-{uuid}`` built from the feature ID.
        """
        # Prefer DISCH (stable government ID) over UUID when available
        disch = item.get("disch")
        uuid = item.get("uuid")
        school_id = f"BW-{disch}" if disch else f"BW-UUID-{uuid}"

        return School(
            id=school_id,
            name=item.get("name"),
            address=item.get("address"),
            zip=item.get("zip"),
            city=item.get("city"),
            email=item.get("email"),
            phone=item.get("phone"),
            fax=item.get("fax"),
            website=item.get("website"),
            school_type=item.get("school_type"),
            latitude=item.get("lat"),
            longitude=item.get("lon"),
        )
0 commit comments