Skip to content

Commit f9cae57

Browse files
authored
Merge pull request #99 from MaineDSA/fix_pricing_parser
fix parsing of costs
2 parents 7f4fe53 + 41c2267 commit f9cae57

12 files changed

+1212
-5207
lines changed

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ dependencies = [
88
"beautifulsoup4~=4.14.0",
99
"google-auth~=2.49.1",
1010
"gspread~=6.2.1",
11-
"patchright~=1.57.2",
11+
"patchright~=1.58.2",
1212
"python-dotenv~=1.2.0",
1313
"tqdm~=4.67.0",
1414
]
@@ -165,3 +165,7 @@ exclude_also = [
165165
'if __name__ == "__main__":',
166166
"main\\(\\)",
167167
]
168+
169+
[tool.mypy]
170+
local_partial_types = true
171+
strict_bytes = true

src/automation.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
MIN_SCROLL_UP,
2020
MIN_WAIT_TIME,
2121
PROBABILITY_SCROLL_UP,
22-
ZillowParseError,
2322
)
2423
from src.scraper import PropertyListing, ZillowHomeFinder
2524

@@ -68,13 +67,14 @@ async def get_browser_page(context: BrowserContext, *, require_new_page: bool =
6867
async def close_modal_if_present(page: Page) -> None:
6968
"""Close modal dialog by clicking button with class containing 'CloseButton', if present."""
7069
try:
70+
await page.wait_for_load_state()
7171
close_button = page.locator("button[class*='CloseButton']").first
7272
is_visible = await close_button.is_visible()
73-
if not is_visible:
74-
msg = "Popup modal blocked page loading, cannot scrape."
75-
raise ZillowParseError(msg)
76-
logger.debug("Popup modal detected, closing it")
77-
await close_button.click()
73+
if is_visible:
74+
logger.debug("Popup modal detected, attempting to close it")
75+
await close_button.click()
76+
return
77+
logger.warning("Invisible CloseButton modal found")
7878
except TimeoutError as e:
7979
logger.debug("No CloseButton modal found or could not close: %s", e)
8080

@@ -193,6 +193,7 @@ async def scroll_and_load_listings(page: Page, max_entries: int = 100, max_no_ch
193193
logger.debug("Lazy loading complete. Total property cards loaded: %s", final_count)
194194

195195
await scroll_to_top(page)
196+
await simulate_human_behavior(page)
196197

197198

198199
async def check_and_click_next_page(page: Page) -> bool:
@@ -246,6 +247,7 @@ async def sort_by_newest(page: Page) -> None:
246247

247248
await sort_button.click()
248249
await page.wait_for_load_state()
250+
await simulate_human_behavior(page)
249251

250252
newest_button = page.get_by_text("Newest")
251253
if not newest_button:

src/form_submission.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
from patchright.async_api import TimeoutError as PlaywrightTimeoutError
88
from tqdm import tqdm
99

10-
from src.constants import MAX_WAIT_TIME, MIN_WAIT_TIME, GoogleFormConstants
10+
from src.automation import simulate_human_behavior
11+
from src.constants import GoogleFormConstants
1112
from src.scraper import PropertyListing
1213

1314
logger = logging.getLogger(__name__)
@@ -17,7 +18,7 @@
1718
async def _submit_single_listing(page: Page, url: str, listing: PropertyListing) -> None:
1819
"""Submit a single listing to the Google Form."""
1920
await page.goto(url)
20-
await page.wait_for_timeout(cryptogen.randint(MIN_WAIT_TIME, MAX_WAIT_TIME))
21+
await simulate_human_behavior(page)
2122

2223
await page.fill(GoogleFormConstants.ADDRESS_INPUT_XPATH, listing.address)
2324
await page.fill(GoogleFormConstants.PRICE_INPUT_XPATH, listing.price)
@@ -31,7 +32,7 @@ async def _submit_single_listing(page: Page, url: str, listing: PropertyListing)
3132
error_msg = f"Form submission confirmation not received for {listing.address}"
3233
raise PlaywrightTimeoutError(error_msg) from e
3334

34-
await page.wait_for_timeout(cryptogen.randint(MIN_WAIT_TIME, MAX_WAIT_TIME))
35+
await simulate_human_behavior(page)
3536

3637

3738
async def submit_listings(page: Page, form_url: str, listings: list[PropertyListing]) -> None:

src/main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ async def scrape_listings(context: BrowserContext, config: Config) -> list[Prope
3535
raise BaseException(error_msg)
3636

3737
await close_modal_if_present(page)
38+
await simulate_human_behavior(page)
3839

3940
logger.info("Scraping all listings...")
4041
await sort_by_newest(page)

src/scraper.py

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ def _parse_address(self) -> str:
4848

4949
def _parse_main_link(self) -> str:
5050
"""Extract main property link from property card."""
51-
link_element = self.card.find("a", class_="property-card-link", attrs={"data-test": "property-card-link"})
52-
if not isinstance(link_element, Tag):
51+
link_element = self.card.find("a", class_=re.compile("property.+-link"), attrs={"data-test": re.compile("property.+-link")})
52+
if not isinstance(link_element, Tag) or not link_element.get("href"):
5353
return ""
5454

5555
href = cast("str", link_element.get("href", "")).strip()
@@ -118,11 +118,11 @@ def _format_price_range(self, prices: list[str]) -> str | None:
118118

119119
def _get_units_count(self) -> int:
120120
"""Extract number of available units."""
121-
badge_area = self.card.find("div", class_=re.compile(r"StyledPropertyCardBadgeArea"))
121+
badge_area = self.card.find("div", attrs={"data-c11n-component": "PropertyCard.BadgeArea"})
122122
if not badge_area or isinstance(badge_area, NavigableString):
123123
return 1
124124

125-
badges = badge_area.find_all("span", class_=re.compile(r"StyledPropertyCardBadge"))
125+
badges = badge_area.find_all("span", attrs={"data-c11n-component": "PropertyCard.Badge"})
126126
for badge in badges:
127127
badge_text = badge.get_text(strip=True).lower()
128128
unit_match = self._PATTERN_UNIT_COUNT.search(badge_text)
@@ -151,7 +151,23 @@ def _get_main_price_listings(self) -> list[PropertyListing]:
151151
if not main_price_element:
152152
return []
153153

154-
price_text = self._clean_price_text(main_price_element.get_text(strip=True))
154+
# The price container can contain multiple spans like "Fees may apply".
155+
# get_text() on the outer span concatenates them without spaces, corrupting
156+
# the price text (e.g. "$1,608+ 2 bdsFees may apply"). Instead, grab only
157+
# the first nested span which contains the actual price string.
158+
inner_span = main_price_element.find("span")
159+
if inner_span and not isinstance(inner_span, Tag):
160+
msg = f"inner_span type is incorrect: {type(inner_span)}"
161+
raise TypeError(msg)
162+
163+
price_span = inner_span.find("span") if inner_span else None
164+
if price_span and not isinstance(price_span, Tag):
165+
msg = f"price_span type is incorrect: {type(price_span)}"
166+
raise TypeError(msg)
167+
168+
raw_text = (price_span or inner_span or main_price_element).get_text(strip=True)
169+
170+
price_text = self._clean_price_text(raw_text)
155171
if not price_text:
156172
return []
157173

@@ -170,22 +186,32 @@ def _get_inventory_listings(self) -> list[PropertyListing]:
170186
if not inventory_section or isinstance(inventory_section, NavigableString):
171187
return []
172188

173-
# Extract price and bedroom data
174-
price_elements = inventory_section.find_all("span", class_=re.compile(r"PriceText"))
175-
bed_elements = inventory_section.find_all("span", class_=re.compile(r"BedText"))
189+
price_bed_pairs: list[tuple[str, str, str]] = [] # (price, bed_info, link)
190+
for anchor in inventory_section.find_all("a"):
191+
box = anchor.find("div", attrs={"data-testid": "PropertyCardInventoryBox"})
192+
if not box or isinstance(box, NavigableString):
193+
continue
194+
195+
spans = box.find_all("span")
196+
if not spans:
197+
continue
198+
199+
price_text = self._clean_price_text(spans[0].get_text(strip=True))
200+
if not price_text:
201+
continue
202+
203+
bed_info = spans[1].get_text(strip=True) if len(spans) > 1 else ""
204+
205+
href = cast("str", anchor.get("href", "")).strip()
206+
link = href if href.startswith("http") else f"https://www.zillow.com{href}"
176207

177-
price_bed_pairs = []
178-
for i, price_elem in enumerate(price_elements):
179-
price_text = self._clean_price_text(price_elem.get_text(strip=True))
180-
if price_text:
181-
bed_info = bed_elements[i].get_text(strip=True) if i < len(bed_elements) else ""
182-
price_bed_pairs.append((price_text, bed_info))
208+
price_bed_pairs.append((price_text, bed_info, link))
183209

184210
units_count = self._get_units_count()
185211

186212
# Handle multiple units with price range
187213
if units_count > 1 and len(price_bed_pairs) > 1:
188-
prices = [price for price, _ in price_bed_pairs]
214+
prices = [price for price, _, __ in price_bed_pairs]
189215
price_range = cast("str", self._format_price_range(prices))
190216

191217
# Calculate median of the range
@@ -199,15 +225,14 @@ def _get_inventory_listings(self) -> list[PropertyListing]:
199225

200226
# Create individual listings
201227
listings = []
202-
for price, bed_info in price_bed_pairs:
228+
for price, bed_info, link in price_bed_pairs:
203229
address = self.address + (f" ({bed_info})" if bed_info else "")
204-
specific_link = self._create_specific_link(bed_info)
205230

206231
# For individual listings, median is same as price
207232
numeric_price = self._extract_numeric_price(price)
208233
median_price = str(numeric_price) if numeric_price else price
209234

210-
listings.append(PropertyListing(address, price, median_price, specific_link))
235+
listings.append(PropertyListing(address, price, median_price, link))
211236

212237
return listings
213238

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
def zillow_search_page_html() -> str:
99
"""Load the vendored Zillow search results page as html text."""
1010
html_example_folder = Path("tests/vendored")
11-
return (html_example_folder / "zillow-search-boston-20251128-1.html").read_text(encoding="utf-8")
11+
return (html_example_folder / "zillow-search-boston-20260402-1.html").read_text(encoding="utf-8")
1212

1313

1414
@pytest.fixture

tests/test_form_submission.py

Lines changed: 5 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def wait_for_selector_side_effect(*_, **__) -> AsyncMock:
5353
mock_page.wait_for_selector.side_effect = wait_for_selector_side_effect
5454
form_url = "https://example.com/form"
5555

56-
with caplog.at_level("INFO"), patch("src.form_submission.cryptogen.randint", return_value=250):
56+
with caplog.at_level("INFO"), patch("src.automation.cryptogen.randint", return_value=250):
5757
await submit_listings(mock_page, form_url, sample_listings)
5858

5959
assert mock_page.goto.call_count == 3
@@ -67,7 +67,7 @@ async def test_submit_listings_all_succeed(caplog: LogCaptureFixture, sample_lis
6767
mock_page = AsyncMock()
6868
form_url = "https://example.com/form"
6969

70-
with caplog.at_level("INFO"), patch("src.form_submission.cryptogen.randint", return_value=250):
70+
with caplog.at_level("INFO"), patch("src.automation.cryptogen.randint", return_value=250):
7171
await submit_listings(mock_page, form_url, sample_listings)
7272

7373
assert "3 successful, 0 failed" in caplog.text
@@ -82,7 +82,7 @@ async def test_submit_listings_all_fail(caplog: LogCaptureFixture, sample_listin
8282
mock_page.wait_for_selector.side_effect = PlaywrightTimeoutError("Timeout")
8383
form_url = "https://example.com/form"
8484

85-
with caplog.at_level("INFO"), patch("src.form_submission.cryptogen.randint", return_value=250):
85+
with caplog.at_level("INFO"), patch("src.automation.cryptogen.randint", return_value=250):
8686
await submit_listings(mock_page, form_url, sample_listings)
8787

8888
assert "0 successful, 3 failed" in caplog.text
@@ -107,7 +107,7 @@ async def test_submit_single_listing_flow_order() -> None:
107107
mock_page.wait_for_selector.side_effect = lambda *_, **__: call_order.append("wait_for_selector")
108108
mock_page.wait_for_timeout.side_effect = lambda _: call_order.append("wait_for_timeout")
109109

110-
with patch("src.form_submission.cryptogen.randint", return_value=250):
110+
with patch("src.automation.cryptogen.randint", return_value=250):
111111
await _submit_single_listing(mock_page, form_url, listing)
112112

113113
# Verify the sequence
@@ -123,30 +123,6 @@ async def test_submit_single_listing_flow_order() -> None:
123123
]
124124

125125

126-
@pytest.mark.asyncio
127-
async def test_submit_listings_uses_random_waits() -> None:
128-
"""Test that random wait times are used between submissions."""
129-
mock_page = AsyncMock()
130-
listings = [
131-
PropertyListing("Addr1", "$1000", "1000", "http://link1"),
132-
PropertyListing("Addr2", "$2000", "2000", "http://link2"),
133-
]
134-
form_url = "https://example.com/form"
135-
136-
wait_times = []
137-
mock_page.wait_for_timeout.side_effect = wait_times.append
138-
139-
with patch("src.form_submission.cryptogen.randint") as mock_randint:
140-
# Return different values for each call
141-
mock_randint.side_effect = [100, 150, 200, 250]
142-
await submit_listings(mock_page, form_url, listings)
143-
144-
# Should have wait_for_timeout called multiple times (2 per listing)
145-
assert len(wait_times) == 4
146-
# Verify the random values were used
147-
assert wait_times == [100, 150, 200, 250]
148-
149-
150126
@pytest.mark.parametrize(
151127
"empty_list_arg",
152128
[
@@ -177,7 +153,7 @@ async def test_submit_single_listing_field_mapping(mock_page: AsyncMock) -> None
177153
listing = PropertyListing(address="742 Evergreen Terrace", price="$2,500/mo", median_price="2500", link="https://zillow.com/listing/999")
178154
form_url = "https://example.com/form"
179155

180-
with patch("src.form_submission.cryptogen.randint", return_value=250):
156+
with patch("src.automation.cryptogen.randint", return_value=250):
181157
await _submit_single_listing(mock_page, form_url, listing)
182158

183159
fill_calls = mock_page.fill.call_args_list

0 commit comments

Comments
 (0)