Skip to content

Commit 7b63df1

Browse files
authored
Merge branch 'main' into osm-improvements
2 parents 38ebbf9 + 1197e65 commit 7b63df1

File tree

1 file changed

+56
-47
lines changed

1 file changed

+56
-47
lines changed

scraper.py

Lines changed: 56 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
9595
{"match": "27991 Buena Vista Blvd.", "replace": "27991 BUENA VISTA BOULEVARD", "locality": "Los Fresnos"},
9696
{"match": "175 Pike County Blvd.", "replace": "175 PIKE COUNTY BOULEVARD", "locality": "Lords Valley"},
9797
{"match": "500 W. 2nd Street", "replace": "301 W. 2nd", "locality": "Rolla"},
98-
{"match": "307 Saint Joseph St", "replace": "300 KANSAS CITY STREET NONE", "locality": "Rapid City"},
9998
{"match": "3405 West Highway 146", "replace": "3405 W HWY 146", "locality": "LaGrange"},
10099
{"match": "1623 E J Street, Suite 2", "replace": "1623 E. J STREET", "locality": "Tacoma"},
101100
{"match": "1805 W 32nd Street", "replace": "1805 W 32ND ST", "locality": "Baldwin"},
@@ -107,7 +106,6 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
107106
{"match": "2190 E Mesquite Avenue", "replace": "2190 EAST MESQUITE AVENUE", "locality": "Pahrump"},
108107
{"match": "287 Industrial Drive", "replace": "327 INDUSTRIAL DRIVE", "locality": "Jonesboro"},
109108
{"match": "1572 Gateway Road", "replace": "1572 GATEWAY", "locality": "Calexico"},
110-
{"match": "203 Aspinall Avenue", "replace": "203 ASPINAL AVE. PO BOX 3236", "locality": "Hagatna"},
111109
{"match": "1199 N Haseltine Road", "replace": "1199 N HASELTINE RD", "locality": "Springfield"},
112110
{"match": "1701 North Washington", "replace": "1701 NORTH WASHINGTON ST", "locality": "Grand Forks"},
113111
{"match": "611 Frontage Road", "replace": "611 FRONTAGE RD", "locality": "McFarland"},
@@ -127,7 +125,7 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
127125
{"match": "704 E Broadway Street", "replace": "702 E BROADWAY ST", "locality": "Eden"},
128126
{"match": "1300 E Hwy 107", "replace": "1330 HIGHWAY 107", "locality": "La Villa"},
129127
{"match": "216 W. Center Street", "replace": "215 WEST CENTRAL STREET", "locality": "Juneau"},
130-
{"match": "300 El Racho Way ", "replace": "300 EL RANCHO WAY", "locality": "Dilley"},
128+
{"match": "300 El Rancho Way ", "replace": "300 EL RANCHO WAY", "locality": "Dilley"},
131129
{"match": "3130 North Oakland Street", "replace": "3130 OAKLAND ST", "locality": "Aurora"},
132130
{"match": "03151 Co. Rd. 24.2", "replace": "3151 ROAD 2425 ROUTE 1", "locality": "Stryker"},
133131
{"match": "20 Hobo Forks Road", "replace": "20 HOBO FORK RD", "locality": "Natchez"},
@@ -146,22 +144,37 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
146144
"locality": "Bowling Green",
147145
},
148146
{"match": "58 Pine Mountain Road", "replace": "58 PINE MOUNTAIN RD", "locality": "McElhattan"},
147+
{
148+
"match": "Adelanto East 10400 Rancho Road | Adelanto West 10250 Rancho Road",
149+
"replace": "10250 Rancho Road",
150+
"locality": "Adelanto",
151+
},
152+
{"match": "4702 East Saunders", "replace": "4702 EAST SAUNDERS STREET", "locality": "Laredo"},
153+
{"match": "9998 S. Highway 98", "replace": "9998 SOUTH HIGHWAY 83", "locality": "Laredo"},
149154
# a unique one, 'cause the PHONE NUMBER IS IN THE ADDRESS?!
150155
{"match": "911 PARR BLVD 775 328 3308", "replace": "911 E Parr Blvd", "locality": "RENO"},
156+
# fix a few shockingly bad addresses in spreadsheet
157+
{"match": "DEPARTMENT OF CORRECTIONS 1618 ASH STREET", "replace": "1618 Ash Street", "locality": "ERIE"},
158+
{"match": "203 ASPINAL AVE. PO BOX 3236", "replace": "203 Aspinall Avenue", "locality": "HAGATNA"},
159+
{
160+
"match": "11866 HASTINGS BRIDGE ROAD P.O. BOX 429",
161+
"replace": "11866 Hastings Bridge Road",
162+
"locality": "LOVEJOY",
163+
},
164+
{"match": "300 KANSAS CITY STREET NONE", "replace": "307 Saint Joseph St", "locality": "RAPID CITY"},
165+
{"match": "4909 FM 2826", "replace": "4909 Farm to Market Road", "locality": "ROBSTOWN"},
166+
{"match": "6920 DIGITAL RD", "replace": "11541 Montana Avenue", "locality": "EL PASO"},
151167
# default matches should come last
152168
{"match": "'s", "replace": "", "locality": ""},
153169
{"match": ".", "replace": "", "locality": ""},
154170
{"match": ",", "replace": "", "locality": ""},
155171
]
156-
stripped_street = street
157172
cleaned = False
158-
if any(f["match"] in stripped_street for f in street_filters):
159-
cleaned = True
160173
for f in street_filters:
161-
if (f["match"] in stripped_street) and ((f["locality"] and f["locality"] == locality) or not f["locality"]):
162-
stripped_street = stripped_street.replace(f["match"], f["replace"])
174+
if (f["match"] in street) and ((f["locality"] and f["locality"] == locality) or not f["locality"]):
175+
street = street.replace(f["match"], f["replace"])
163176
cleaned = True
164-
return stripped_street, cleaned
177+
return street, cleaned
165178

166179
def _repair_zip(self, zip_code: int, locality: str) -> Tuple[str, bool]:
167180
"""
@@ -173,22 +186,21 @@ def _repair_zip(self, zip_code: int, locality: str) -> Tuple[str, bool]:
173186
if len(zcode) == 4:
174187
zcode = f"0{zcode}"
175188
cleaned = True
176-
# This address is an absolute mess
177-
elif zcode == "89512" and locality == "Reno":
178-
zcode = "89506"
179-
cleaned = True
180-
elif zcode == "82901" and locality == "Rock Springs":
181-
zcode = "82935"
182-
cleaned = True
183-
elif zcode == "98421-1615" and locality == "Tacoma":
184-
zcode = "98421"
185-
cleaned = True
186-
elif zcode == "89048" and locality == "Pahrump":
187-
zcode = "89060"
188-
cleaned = True
189-
elif zcode == "85132" and locality == "Florence":
190-
zcode = "85232"
191-
cleaned = True
189+
matches = [
190+
{"match": "89512", "replace": "89506", "locality": "Reno"},
191+
{"match": "82901", "replace": "82935", "locality": "Rock Springs"},
192+
{"match": "98421-1615", "replace": "98421", "locality": "Tacoma"},
193+
{"match": "89048", "replace": "89060", "locality": "Pahrump"},
194+
{"match": "85132", "replace": "85232", "locality": "Florence"},
195+
# Laredo facility addresses are particularly bad...
196+
{"match": "78041", "replace": "78401", "locality": "LAREDO"},
197+
{"match": "78401", "replace": "78046", "locality": "LAREDO"},
198+
]
199+
for z in matches:
200+
if z["match"] == zcode and z["locality"] == locality:
201+
zcode = z["replace"]
202+
cleaned = True
203+
break
192204
return zcode, cleaned
193205

194206
def _repair_locality(self, locality: str, administrative_area: str) -> Tuple[str, bool]:
@@ -197,21 +209,18 @@ def _repair_locality(self, locality: str, administrative_area: str) -> Tuple[str
197209
How the post office ever successfully delivered a letter is beyond me
198210
"""
199211
cleaned = False
200-
if locality == "LaGrange" and administrative_area == "KY":
201-
locality = "La Grange"
202-
cleaned = True
203-
elif locality == "Leachfield" and administrative_area == "KY":
204-
locality = "LEITCHFIELD"
205-
cleaned = True
206-
elif locality == "Susupe, Saipan" and administrative_area == "MP":
207-
locality = "SAIPAN"
208-
cleaned = True
209-
elif locality == "Cottonwood Falls" and administrative_area == "KS":
210-
locality = "COTTONWOOD FALL"
211-
cleaned = True
212-
elif locality == "Sault Ste. Marie" and administrative_area == "MI":
213-
locality = "SAULT STE MARIE"
214-
cleaned = True
212+
matches = [
213+
{"match": "LaGrange", "replace": "La Grange", "area": "KY"},
214+
{"match": "Leachfield", "replace": "LEITCHFIELD", "area": "KY"},
215+
{"match": "SAIPAN", "replace": "Susupe, Saipan", "area": "MP"},
216+
{"match": "COTTONWOOD FALL", "replace": "Cottonwood Falls", "area": "KS"},
217+
{"match": "Sault Ste. Marie", "replace": "SAULT STE MARIE", "area": "MI"},
218+
]
219+
for f in matches:
220+
if f["match"] == locality and f["area"] == administrative_area:
221+
locality = f["replace"]
222+
cleaned = True
223+
break
215224
return locality, cleaned
216225

217226
def _load_sheet(self) -> dict:
@@ -241,14 +250,14 @@ def _load_sheet(self) -> dict:
241250
if match:
242251
details["phone"] = match.group(1)
243252
details["_repaired_record"] = True
244-
full_address = ",".join([street, row["City"], row["State"], zcode]).upper()
245-
details["address"]["administrative_area"] = row["State"]
246253
locality, cleaned = self._repair_locality(row["City"], row["State"])
247254
if cleaned:
248255
details["_repaired_record"] = True
249-
details["address"]["locality"] = row["City"]
250-
details["address"]["postal_code"] = row["Zip"]
251-
details["address"]["street"] = row["Address"]
256+
full_address = ",".join([street, locality, row["State"], zcode]).upper()
257+
details["address"]["administrative_area"] = row["State"]
258+
details["address"]["locality"] = locality
259+
details["address"]["postal_code"] = zcode
260+
details["address"]["street"] = street
252261
details["name"] = row["Name"]
253262
details["population"]["male"]["criminal"] = row["Male Crim"]
254263
details["population"]["male"]["non_criminal"] = row["Male Non-Crim"]
@@ -325,11 +334,11 @@ def scrape_facilities(self):
325334
facility["_repaired_record"] = True
326335
zcode, cleaned = self._repair_zip(addr["postal_code"], addr["locality"])
327336
if cleaned:
328-
facility["postal_code"] = zcode
337+
addr["postal_code"] = zcode
329338
facility["_repaired_record"] = True
330339
locality, cleaned = self._repair_locality(addr["locality"], addr["administrative_area"])
331340
if cleaned:
332-
facility["locality"] = locality
341+
addr["locality"] = locality
333342
facility["_repaired_record"] = True
334343
full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper()
335344
if full_address in self.facilities_data["facilities"].keys():

0 commit comments

Comments
 (0)