Skip to content

Commit b7717f6

Browse files
committed
Merge branch 'main' into xlsx-output
2 parents 36ced9c + 1197e65 commit b7717f6

File tree

1 file changed

+57
-45
lines changed

1 file changed

+57
-45
lines changed

scraper.py

Lines changed: 57 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
9494
{"match": "27991 Buena Vista Blvd.", "replace": "27991 BUENA VISTA BOULEVARD", "locality": "Los Fresnos"},
9595
{"match": "175 Pike County Blvd.", "replace": "175 PIKE COUNTY BOULEVARD", "locality": "Lords Valley"},
9696
{"match": "500 W. 2nd Street", "replace": "301 W. 2nd", "locality": "Rolla"},
97-
{"match": "307 Saint Joseph St", "replace": "300 KANSAS CITY STREET NONE", "locality": "Rapid City"},
9897
{"match": "3405 West Highway 146", "replace": "3405 W HWY 146", "locality": "LaGrange"},
9998
{"match": "1623 E J Street, Suite 2", "replace": "1623 E. J STREET", "locality": "Tacoma"},
10099
{"match": "1805 W 32nd Street", "replace": "1805 W 32ND ST", "locality": "Baldwin"},
@@ -106,7 +105,6 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
106105
{"match": "2190 E Mesquite Avenue", "replace": "2190 EAST MESQUITE AVENUE", "locality": "Pahrump"},
107106
{"match": "287 Industrial Drive", "replace": "327 INDUSTRIAL DRIVE", "locality": "Jonesboro"},
108107
{"match": "1572 Gateway Road", "replace": "1572 GATEWAY", "locality": "Calexico"},
109-
{"match": "203 Aspinall Avenue", "replace": "203 ASPINAL AVE. PO BOX 3236", "locality": "Hagatna"},
110108
{"match": "1199 N Haseltine Road", "replace": "1199 N HASELTINE RD", "locality": "Springfield"},
111109
{"match": "1701 North Washington", "replace": "1701 NORTH WASHINGTON ST", "locality": "Grand Forks"},
112110
{"match": "611 Frontage Road", "replace": "611 FRONTAGE RD", "locality": "McFarland"},
@@ -126,7 +124,7 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
126124
{"match": "704 E Broadway Street", "replace": "702 E BROADWAY ST", "locality": "Eden"},
127125
{"match": "1300 E Hwy 107", "replace": "1330 HIGHWAY 107", "locality": "La Villa"},
128126
{"match": "216 W. Center Street", "replace": "215 WEST CENTRAL STREET", "locality": "Juneau"},
129-
{"match": "300 El Racho Way ", "replace": "300 EL RANCHO WAY", "locality": "Dilley"},
127+
{"match": "300 El Rancho Way ", "replace": "300 EL RANCHO WAY", "locality": "Dilley"},
130128
{"match": "3130 North Oakland Street", "replace": "3130 OAKLAND ST", "locality": "Aurora"},
131129
{"match": "03151 Co. Rd. 24.2", "replace": "3151 ROAD 2425 ROUTE 1", "locality": "Stryker"},
132130
{"match": "20 Hobo Forks Road", "replace": "20 HOBO FORK RD", "locality": "Natchez"},
@@ -145,22 +143,37 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
145143
"locality": "Bowling Green",
146144
},
147145
{"match": "58 Pine Mountain Road", "replace": "58 PINE MOUNTAIN RD", "locality": "McElhattan"},
146+
{
147+
"match": "Adelanto East 10400 Rancho Road | Adelanto West 10250 Rancho Road",
148+
"replace": "10250 Rancho Road",
149+
"locality": "Adelanto",
150+
},
151+
{"match": "4702 East Saunders", "replace": "4702 EAST SAUNDERS STREET", "locality": "Laredo"},
152+
{"match": "9998 S. Highway 98", "replace": "9998 SOUTH HIGHWAY 83", "locality": "Laredo"},
148153
# a unique case: the source record has the phone number (775 328 3308) embedded in the street field
149154
{"match": "911 PARR BLVD 775 328 3308", "replace": "911 E Parr Blvd", "locality": "RENO"},
155+
# fix a few shockingly bad addresses in spreadsheet
156+
{"match": "DEPARTMENT OF CORRECTIONS 1618 ASH STREET", "replace": "1618 Ash Street", "locality": "ERIE"},
157+
{"match": "203 ASPINAL AVE. PO BOX 3236", "replace": "203 Aspinall Avenue", "locality": "HAGATNA"},
158+
{
159+
"match": "11866 HASTINGS BRIDGE ROAD P.O. BOX 429",
160+
"replace": "11866 Hastings Bridge Road",
161+
"locality": "LOVEJOY",
162+
},
163+
{"match": "300 KANSAS CITY STREET NONE", "replace": "307 Saint Joseph St", "locality": "RAPID CITY"},
164+
{"match": "4909 FM 2826", "replace": "4909 Farm to Market Road", "locality": "ROBSTOWN"},
165+
{"match": "6920 DIGITAL RD", "replace": "11541 Montana Avenue", "locality": "EL PASO"},
150166
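# default (locality-independent) matches must come last: filters are applied in order, so stripping "." or "," earlier would keep the specific matches that contain them from ever matching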
151167
{"match": "'s", "replace": "", "locality": ""},
152168
{"match": ".", "replace": "", "locality": ""},
153169
{"match": ",", "replace": "", "locality": ""},
154170
]
155-
stripped_street = street
156171
cleaned = False
157-
if any(f["match"] in stripped_street for f in street_filters):
158-
cleaned = True
159172
for f in street_filters:
160-
if (f["match"] in stripped_street) and ((f["locality"] and f["locality"] == locality) or not f["locality"]):
161-
stripped_street = stripped_street.replace(f["match"], f["replace"])
173+
if (f["match"] in street) and ((f["locality"] and f["locality"] == locality) or not f["locality"]):
174+
street = street.replace(f["match"], f["replace"])
162175
cleaned = True
163-
return stripped_street, cleaned
176+
return street, cleaned
164177

165178
def _repair_zip(self, zip_code: int, locality: str) -> Tuple[str, bool]:
166179
"""
@@ -172,22 +185,21 @@ def _repair_zip(self, zip_code: int, locality: str) -> Tuple[str, bool]:
172185
if len(zcode) == 4:
173186
zcode = f"0{zcode}"
174187
cleaned = True
175-
# This address is an absolute mess
176-
if zcode == "89512" and locality == "Reno":
177-
zcode = "89506"
178-
cleaned = True
179-
if zcode == "82901" and locality == "Rock Springs":
180-
zcode = "82935"
181-
cleaned = True
182-
if zcode == "98421-1615" and locality == "Tacoma":
183-
zcode = "98421"
184-
cleaned = True
185-
if zcode == "89048" and locality == "Pahrump":
186-
zcode = "89060"
187-
cleaned = True
188-
if zcode == "85132" and locality == "Florence":
189-
zcode = "85232"
190-
cleaned = True
188+
matches = [
189+
{"match": "89512", "replace": "89506", "locality": "Reno"},
190+
{"match": "82901", "replace": "82935", "locality": "Rock Springs"},
191+
{"match": "98421-1615", "replace": "98421", "locality": "Tacoma"},
192+
{"match": "89048", "replace": "89060", "locality": "Pahrump"},
193+
{"match": "85132", "replace": "85232", "locality": "Florence"},
194+
# Laredo facility addresses are particularly bad...
195+
{"match": "78041", "replace": "78401", "locality": "LAREDO"},
196+
{"match": "78401", "replace": "78046", "locality": "LAREDO"},
197+
]
198+
for z in matches:
199+
if z["match"] == zcode and z["locality"] == locality:
200+
zcode = z["replace"]
201+
cleaned = True
202+
break
191203
return zcode, cleaned
192204

193205
def _repair_locality(self, locality: str, administrative_area: str) -> Tuple[str, bool]:
@@ -196,21 +208,18 @@ def _repair_locality(self, locality: str, administrative_area: str) -> Tuple[str
196208
How the post office ever successfully delivered a letter is beyond me
197209
"""
198210
cleaned = False
199-
if locality == "LaGrange" and administrative_area == "KY":
200-
locality = "La Grange"
201-
cleaned = True
202-
if locality == "Leachfield" and administrative_area == "KY":
203-
locality = "LEITCHFIELD"
204-
cleaned = True
205-
if locality == "Susupe, Saipan" and administrative_area == "MP":
206-
locality = "SAIPAN"
207-
cleaned = True
208-
if locality == "Cottonwood Falls" and administrative_area == "KS":
209-
locality = "COTTONWOOD FALL"
210-
cleaned = True
211-
if locality == "Sault Ste. Marie" and administrative_area == "MI":
212-
locality = "SAULT STE MARIE"
213-
cleaned = True
211+
matches = [
212+
{"match": "LaGrange", "replace": "La Grange", "area": "KY"},
213+
{"match": "Leachfield", "replace": "LEITCHFIELD", "area": "KY"},
214+
{"match": "SAIPAN", "replace": "Susupe, Saipan", "area": "MP"},
215+
{"match": "COTTONWOOD FALL", "replace": "Cottonwood Falls", "area": "KS"},
216+
{"match": "Sault Ste. Marie", "replace": "SAULT STE MARIE", "area": "MI"},
217+
]
218+
for f in matches:
219+
if f["match"] == locality and f["area"] == administrative_area:
220+
locality = f["replace"]
221+
cleaned = True
222+
break
214223
return locality, cleaned
215224

216225
def _load_sheet(self) -> dict:
@@ -240,14 +249,14 @@ def _load_sheet(self) -> dict:
240249
if match:
241250
details["phone"] = match.group(1)
242251
details["_repaired_record"] = True
243-
full_address = ",".join([street, row["City"], row["State"], zcode]).upper()
244-
details["address"]["administrative_area"] = row["State"]
245252
locality, cleaned = self._repair_locality(row["City"], row["State"])
246253
if cleaned:
247254
details["_repaired_record"] = True
248-
details["address"]["locality"] = row["City"]
249-
details["address"]["postal_code"] = row["Zip"]
250-
details["address"]["street"] = row["Address"]
255+
full_address = ",".join([street, locality, row["State"], zcode]).upper()
256+
details["address"]["administrative_area"] = row["State"]
257+
details["address"]["locality"] = locality
258+
details["address"]["postal_code"] = zcode
259+
details["address"]["street"] = street
251260
details["name"] = row["Name"]
252261
details["population"]["male"]["criminal"] = row["Male Crim"]
253262
details["population"]["male"]["non_criminal"] = row["Male Non-Crim"]
@@ -316,12 +325,15 @@ def scrape_facilities(self):
316325
addr = facility["address"]
317326
street, cleaned = self._clean_street(addr["street"], addr["locality"])
318327
if cleaned:
328+
addr["street"] = street
319329
facility["_repaired_record"] = True
320330
zcode, cleaned = self._repair_zip(addr["postal_code"], addr["locality"])
321331
if cleaned:
332+
addr["postal_code"] = zcode
322333
facility["_repaired_record"] = True
323334
locality, cleaned = self._repair_locality(addr["locality"], addr["administrative_area"])
324335
if cleaned:
336+
addr["locality"] = locality
325337
facility["_repaired_record"] = True
326338
full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper()
327339
if not facility["address_str"]:

0 commit comments

Comments
 (0)