Commit aa53dbe

refactor: clean up related search parsing in Google scraper
1 parent 0415119 commit aa53dbe

1 file changed: +12 -8 lines changed

google-scraper/google.py

Lines changed: 12 additions & 8 deletions
@@ -21,6 +21,7 @@
     "asp": True,
     # set the proxy location to US
     "country": "US",
+    "render_js": True,
 }
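With this change every request made from this file renders JavaScript by default. A minimal sketch of the resulting shared options, assuming BASE_CONFIG holds exactly the keys shown in the hunk (the key names map to keyword arguments of Scrapfly's ScrapeConfig):

# shared options passed to every ScrapeConfig via **BASE_CONFIG (sketch)
BASE_CONFIG = {
    # bypass Google's anti-scraping protection
    "asp": True,
    # set the proxy location to US
    "country": "US",
    # render JavaScript for all requests instead of enabling it per call
    "render_js": True,
}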
@@ -53,7 +54,7 @@ def aria_no_label(label):
     "phone": aria_no_label("Phone: "),
     "review_count": aria_with_label(" reviews").get(),
     # to extract star numbers from text we can use regex pattern for numbers: "\d+"
-    "stars": aria_with_label(" stars").re("\d+.*\d+")[0],
+    "stars": aria_with_label(" stars").re(r"\d+.*\d+")[0],
     "5_stars": aria_with_label("5 stars").re(r"(\d+) review")[0],
     "4_stars": aria_with_label("4 stars").re(r"(\d+) review")[0],
     "3_stars": aria_with_label("3 stars").re(r"(\d+) review")[0],
@@ -70,7 +71,6 @@ async def scrape_google_map_places(urls: List[str]) -> List[Dict]:
         ScrapeConfig(
             url=url,
             **BASE_CONFIG,
-            render_js=True,
             wait_for_selector="//button[contains(@jsaction, 'reviewlegaldisclosure')]",
         )
         for url in urls
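Since the first hunk moved render_js into BASE_CONFIG, dropping the per-call flag leaves the request unchanged; expanding the kwargs makes that visible (a sketch, not the file's literal code):

# effective call after **BASE_CONFIG is expanded (sketch)
ScrapeConfig(
    url=url,
    asp=True,
    country="US",
    render_js=True,  # now supplied by BASE_CONFIG
    wait_for_selector="//button[contains(@jsaction, 'reviewlegaldisclosure')]",
)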
@@ -152,19 +152,23 @@ def parse_keywords(response: ScrapeApiResponse) -> List[str]:
     """parse keywords from google search pages"""
     selector = response.selector
     related_search = []
-    for suggestion in selector.xpath(
-        "//div[div/div/span[contains(text(), 'search for')]]/following-sibling::div//a"
-    ):
-        related_search.append("".join(suggestion.xpath(".//text()").getall()))
-    people_ask_for = selector.css(".related-question-pair span::text").getall()
+
+    for suggestion in selector.xpath("//div[.//span[contains(text(), 'search for')]]/following-sibling::div//a[contains(@href, '/search')]"):
+        text = "".join(suggestion.xpath(".//text()").getall())
+        text = "".join(suggestion.xpath(".//text()").getall()).strip()
+        if len(text.split()) > 2:
+            related_search.append(text)
+
+
+    people_ask_for = [question.strip() for question in selector.css(".related-question-pair .CSkcDe::text").getall() if question.strip()]
     return {"related_search": related_search, "people_ask_for": people_ask_for}
 
 
 async def scrape_keywords(query: str) -> List[str]:
     """request google search page for keyword data"""
     response = await SCRAPFLY.async_scrape(
         ScrapeConfig(
-            f"https://www.google.com/search?hl=en&q={quote(query)}", **BASE_CONFIG, render_js=True
+            f"https://www.google.com/search?hl=en&q={quote(query)}", **BASE_CONFIG
         )
     )
     data = parse_keywords(response)
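The reworked related-search logic can be exercised offline with parsel, the selector library behind response.selector. The markup below is a hypothetical stand-in for Google's related-searches block, meant only to show what the new XPath and the two filters keep and drop:

from parsel import Selector

# hypothetical fragment mimicking the related-searches section
html = """
<div><div><div><span>People also search for</span></div></div></div>
<div>
  <a href="/search?q=web+scraping+python">web scraping python</a>
  <a href="/maps/place/x">maps link, dropped by the href filter</a>
</div>
"""

selector = Selector(text=html)
related_search = []
for suggestion in selector.xpath(
    "//div[.//span[contains(text(), 'search for')]]"
    "/following-sibling::div//a[contains(@href, '/search')]"
):
    text = "".join(suggestion.xpath(".//text()").getall()).strip()
    if len(text.split()) > 2:  # keep only multi-word suggestions
        related_search.append(text)

print(related_search)  # ['web scraping python']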
