Skip to content

Commit 0660e53

Browse files
authored
Merge pull request #62 from scrapfly/fix-google.com
Fix google.com
2 parents 0415119 + d52100a commit 0660e53

File tree

5 files changed

+235
-219
lines changed

5 files changed

+235
-219
lines changed

google-scraper/google.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"asp": True,
2222
# set the proxy location to US
2323
"country": "US",
24+
"render_js": True,
2425
}
2526

2627

@@ -53,7 +54,7 @@ def aria_no_label(label):
5354
"phone": aria_no_label("Phone: "),
5455
"review_count": aria_with_label(" reviews").get(),
5556
# to extract the star rating from the text we can use a regex pattern for decimal numbers: "\d+.*\d+"
56-
"stars": aria_with_label(" stars").re("\d+.*\d+")[0],
57+
"stars": aria_with_label(" stars").re(r"\d+.*\d+")[0],
5758
"5_stars": aria_with_label("5 stars").re(r"(\d+) review")[0],
5859
"4_stars": aria_with_label("4 stars").re(r"(\d+) review")[0],
5960
"3_stars": aria_with_label("3 stars").re(r"(\d+) review")[0],
@@ -70,7 +71,6 @@ async def scrape_google_map_places(urls: List[str]) -> List[Dict]:
7071
ScrapeConfig(
7172
url=url,
7273
**BASE_CONFIG,
73-
render_js=True,
7474
wait_for_selector="//button[contains(@jsaction, 'reviewlegaldisclosure')]",
7575
)
7676
for url in urls
@@ -152,19 +152,23 @@ def parse_keywords(response: ScrapeApiResponse) -> List[str]:
152152
"""parse keywords from google search pages"""
153153
selector = response.selector
154154
related_search = []
155-
for suggestion in selector.xpath(
156-
"//div[div/div/span[contains(text(), 'search for')]]/following-sibling::div//a"
157-
):
158-
related_search.append("".join(suggestion.xpath(".//text()").getall()))
159-
people_ask_for = selector.css(".related-question-pair span::text").getall()
155+
156+
for suggestion in selector.xpath("//div[.//span[contains(text(), 'search for')]]/following-sibling::div//a[contains(@href, '/search')]"):
157+
text = "".join(suggestion.xpath(".//text()").getall())
158+
text = "".join(suggestion.xpath(".//text()").getall()).strip()
159+
if len(text.split()) > 2:
160+
related_search.append(text)
161+
162+
163+
people_ask_for = [question.strip() for question in selector.css(".related-question-pair .CSkcDe::text").getall() if question.strip()]
160164
return {"related_search": related_search, "people_ask_for": people_ask_for}
161165

162166

163167
async def scrape_keywords(query: str) -> List[str]:
164168
"""request google search page for keyword data"""
165169
response = await SCRAPFLY.async_scrape(
166170
ScrapeConfig(
167-
f"https://www.google.com/search?hl=en&q={quote(query)}", **BASE_CONFIG, render_js=True
171+
f"https://www.google.com/search?hl=en&q={quote(query)}", **BASE_CONFIG
168172
)
169173
)
170174
data = parse_keywords(response)
Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,44 @@
11
[
22
{
3-
"name": "Louvre Museum",
3+
"name": "Musée de l'Orangerie",
44
"category": "Art museum",
5-
"address": "75001 Paris, France",
6-
"website": "louvre.fr",
7-
"phone": "+33 1 40 20 53 17",
8-
"review_count": "336,931 reviews",
9-
"stars": "4.7",
10-
"5_stars": "605",
11-
"4_stars": "636",
12-
"3_stars": "104",
13-
"2_stars": "045",
14-
"1_stars": "541"
5+
"address": "Jardin des Tuileries, 75001 Paris, France",
6+
"website": "musee-orangerie.fr",
7+
"phone": "+33 1 44 50 43 00",
8+
"review_count": "21,651 reviews",
9+
"stars": "4.6",
10+
"5_stars": "080",
11+
"4_stars": "831",
12+
"3_stars": "111",
13+
"2_stars": "268",
14+
"1_stars": "361"
15+
},
16+
{
17+
"name": "The Centre Pompidou",
18+
"category": "Cultural center",
19+
"address": "Place Georges-Pompidou, 75004 Paris, France",
20+
"website": "centrepompidou.fr",
21+
"phone": "+33 1 44 78 12 33",
22+
"review_count": "57,289 reviews",
23+
"stars": "4.4",
24+
"5_stars": "416",
25+
"4_stars": "342",
26+
"3_stars": "983",
27+
"2_stars": "245",
28+
"1_stars": "303"
1529
},
1630
{
1731
"name": "Musée d'Orsay",
1832
"category": "Art museum",
1933
"address": "Esplanade Valéry Giscard d'Estaing, 75007 Paris, France",
2034
"website": "musee-orsay.fr",
2135
"phone": "+33 1 40 49 48 14",
22-
"review_count": "99,580 reviews",
36+
"review_count": "106,795 reviews",
2337
"stars": "4.8",
24-
"5_stars": "985",
25-
"4_stars": "492",
26-
"3_stars": "399",
27-
"2_stars": "617",
28-
"1_stars": "087"
29-
},
30-
{
31-
"name": "Musée de l'Orangerie",
32-
"category": "Art museum",
33-
"address": "Jardin des Tuileries, 75001 Paris, France",
34-
"website": "musee-orangerie.fr",
35-
"phone": "+33 1 44 50 43 00",
36-
"review_count": "19,803 reviews",
37-
"stars": "4.6",
38-
"5_stars": "667",
39-
"4_stars": "564",
40-
"3_stars": "998",
41-
"2_stars": "245",
42-
"1_stars": "329"
38+
"5_stars": "112",
39+
"4_stars": "250",
40+
"3_stars": "567",
41+
"2_stars": "683",
42+
"1_stars": "183"
4343
}
4444
]
Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,22 @@
11
[
2-
"https://www.google.com/maps/place/Carnavalet+Museum/data=!4m7!3m6!1s0x47e66e00f9521b7d:0xc8c16b75253918c1!8m2!3d48.8570669!4d2.3628579!16zL20vMDZnM2h4!19sChIJfRtS-QBu5kcRwRg5JXVrwcg?authuser=0&hl=en&rclk=1",
3-
"https://www.google.com/maps/place/Fondation+Louis+Vuitton/data=!4m7!3m6!1s0x47e6655d45f42a25:0x85d645fc15bab5cd!8m2!3d48.8766453!4d2.2635239!16s%2Fm%2F0rytz6f!19sChIJJSr0RV1l5kcRzbW6FfxF1oU?authuser=0&hl=en&rclk=1",
4-
"https://www.google.com/maps/place/Louvre+Museum/data=!4m7!3m6!1s0x47e671d877937b0f:0xb975fcfa192f84d4!8m2!3d48.8606111!4d2.337644!16zL20vMDRnZHI!19sChIJD3uTd9hx5kcR1IQvGfr8dbk?authuser=0&hl=en&rclk=1",
52
"https://www.google.com/maps/place/Mus%C3%A9e+d%27Orsay/data=!4m7!3m6!1s0x47e66e2bb630941b:0xd071bd8cb14423d8!8m2!3d48.8599614!4d2.3265614!16zL20vMGYzYjk!19sChIJG5Qwtitu5kcR2CNEsYy9cdA?authuser=0&hl=en&rclk=1",
3+
"https://www.google.com/maps/place/Mus%C3%A9e+d%27Art+Moderne+de+Paris/data=!4m7!3m6!1s0x47e66fe7988e74af:0x60dec2cd0e194c25!8m2!3d48.8643421!4d2.2978208!16s%2Fm%2F03mhsmh!19sChIJr3SOmOdv5kcRJUwZDs3C3mA?authuser=0&hl=en&rclk=1",
4+
"https://www.google.com/maps/place/Louvre+Museum/data=!4m7!3m6!1s0x47e671d877937b0f:0xb975fcfa192f84d4!8m2!3d48.8606111!4d2.337644!16zL20vMDRnZHI!19sChIJD3uTd9hx5kcR1IQvGfr8dbk?authuser=0&hl=en&rclk=1",
65
"https://www.google.com/maps/place/Mus%C3%A9e+de+l%27Orangerie/data=!4m7!3m6!1s0x47e66e2eeaaaaaa3:0xdc3fd08aa701960a!8m2!3d48.8637884!4d2.3226724!16zL20vMGR0M21s!19sChIJo6qq6i5u5kcRCpYBp4rQP9w?authuser=0&hl=en&rclk=1",
7-
"https://www.google.com/maps/place/Petit+Palais/data=!4m7!3m6!1s0x47e66fd1ce8f4349:0xf3f5be813dd2469c!8m2!3d48.8660479!4d2.3145896!16zL20vMDhkc3du!19sChIJSUOPztFv5kcRnEbSPYG-9fM?authuser=0&hl=en&rclk=1"
6+
"https://www.google.com/maps/place/Mus%C3%A9e+du+quai+Branly+-+Jacques+Chirac/data=!4m7!3m6!1s0x47e66fe0da76cf63:0xb7949d5df6b04424!8m2!3d48.8608889!4d2.297894!16zL20vMGRtbng2!19sChIJY8922uBv5kcRJESw9l2dlLc?authuser=0&hl=en&rclk=1",
7+
"https://www.google.com/maps/place/Mus%C3%A9e+Jacquemart-Andr%C3%A9/data=!4m7!3m6!1s0x47e66fc7b7aef5a5:0x5d7785b84c933a2d!8m2!3d48.875553!4d2.310422!16s%2Fm%2F027cfpq!19sChIJpfWut8dv5kcRLTqTTLiFd10?authuser=0&hl=en&rclk=1",
8+
"https://www.google.com/maps/place/Mus%C3%A9e+National+Picasso-Paris/data=!4m7!3m6!1s0x47e66e0142746b4b:0xc191da9af2fc6df7!8m2!3d48.8598775!4d2.362285!16zL20vMDR4Znpm!19sChIJS2t0QgFu5kcR92388prakcE?authuser=0&hl=en&rclk=1",
9+
"https://www.google.com/maps/place/Mus%C3%A9e+Rodin/data=!4m7!3m6!1s0x47e6702a1eccdb43:0x6468d82811b67058!8m2!3d48.8553072!4d2.3158354!16zL20vMDN5eHpf!19sChIJQ9vMHipw5kcRWHC2ESjYaGQ?authuser=0&hl=en&rclk=1",
10+
"https://www.google.com/maps/place/Fondation+Louis+Vuitton/data=!4m7!3m6!1s0x47e6655d45f42a25:0x85d645fc15bab5cd!8m2!3d48.8766453!4d2.2635239!16s%2Fm%2F0rytz6f!19sChIJJSr0RV1l5kcRzbW6FfxF1oU?authuser=0&hl=en&rclk=1",
11+
"https://www.google.com/maps/place/Mus%C3%A9e+national+des+arts+asiatiques+-+Guimet/data=!4m7!3m6!1s0x47e66fe5cd3929c3:0xb5ec0412a7bc9c89!8m2!3d48.8651018!4d2.2937604!16zL20vMDY4ODdf!19sChIJwyk5zeVv5kcRiZy8pxIE7LU?authuser=0&hl=en&rclk=1",
12+
"https://www.google.com/maps/place/The+Army+Museum/data=!4m7!3m6!1s0x47e66fd7ecd7eabf:0x761e79ce4d1d4227!8m2!3d48.8557933!4d2.3125934!16zL20vMGMycHM0!19sChIJv-rX7Ndv5kcRJ0IdTc55HnY?authuser=0&hl=en&rclk=1",
13+
"https://www.google.com/maps/place/National+Museum+of+Natural+History/data=!4m7!3m6!1s0x47e671f12d404411:0x4743d62149f1c6e1!8m2!3d48.8417009!4d2.3560169!16zL20vMDJsbWJi!19sChIJEURALfFx5kcR4cbxSSHWQ0c?authuser=0&hl=en&rclk=1",
14+
"https://www.google.com/maps/place/Carnavalet+Museum/data=!4m7!3m6!1s0x47e66e00f9521b7d:0xc8c16b75253918c1!8m2!3d48.8570669!4d2.3628579!16zL20vMDZnM2h4!19sChIJfRtS-QBu5kcRwRg5JXVrwcg?authuser=0&hl=en&rclk=1",
15+
"https://www.google.com/maps/place/Petit+Palais/data=!4m7!3m6!1s0x47e66fd1ce8f4349:0xf3f5be813dd2469c!8m2!3d48.8660479!4d2.3145896!16zL20vMDhkc3du!19sChIJSUOPztFv5kcRnEbSPYG-9fM?authuser=0&hl=en&rclk=1",
16+
"https://www.google.com/maps/place/Mus%C3%A9e+Gr%C3%A9vin/data=!4m7!3m6!1s0x47e66e3e9be04a55:0x7def1a3ff98df458!8m2!3d48.8718378!4d2.3422204!16zL20vMDZicGJ6!19sChIJVUrgmz5u5kcRWPSN-T8a730?authuser=0&hl=en&rclk=1",
17+
"https://www.google.com/maps/place/Mus%C3%A9e+Marmottan+Monet/data=!4m7!3m6!1s0x47e665529447f461:0x4ec7611155da854c!8m2!3d48.8592613!4d2.2672282!16zL20vMDZuMDlx!19sChIJYfRHlFJl5kcRTIXaVRFhx04?authuser=0&hl=en&rclk=1",
18+
"https://www.google.com/maps/place/Dali+Museum/data=!4m7!3m6!1s0x47e66e44ca858f05:0xef88ab04490097e9!8m2!3d48.8865119!4d2.3398718!16zL20vMGYyYmxo!19sChIJBY-FykRu5kcR6ZcASQSriO8?authuser=0&hl=en&rclk=1",
19+
"https://www.google.com/maps/place/Cluny+Museum+-+National+Museum+of+the+Middle+Ages/data=!4m7!3m6!1s0x47e671ddf5e1132f:0x5d66e4a5335b37f!8m2!3d48.8504833!4d2.3440808!16zL20vMDF6a3F0!19sChIJLxPh9d1x5kcRf7M1U0pu1gU?authuser=0&hl=en&rclk=1",
20+
"https://www.google.com/maps/place/Mus%C3%A9e+des+Arts+et+M%C3%A9tiers/data=!4m7!3m6!1s0x47e66e1aa6c84435:0x772045ec563b1de2!8m2!3d48.8660399!4d2.3556152!16zL20vMDI4MXFo!19sChIJNUTIphpu5kcR4h07VuxFIHc?authuser=0&hl=en&rclk=1",
21+
"https://www.google.com/maps/place/Perfume+Museum/data=!4m7!3m6!1s0x47e66e31357d11e1:0x9d22b47d6d2b4c4a!8m2!3d48.871493!4d2.330266!16s%2Fm%2F04gt6xq!19sChIJ4RF9NTFu5kcRSkwrbX20Ip0?authuser=0&hl=en&rclk=1"
822
]
Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,16 @@
11
{
22
"related_search": [
33
"Web scraping emails free",
4-
"Web scraping emails yahoo",
54
"Web scraping emails gmail",
6-
"Scrape emails from website free",
7-
"Free email scraper",
8-
"Scrape emails from website Python",
5+
"Email scraper free",
96
"Best free email scraper",
10-
"AI email scraper"
7+
"AI email scraper",
8+
"Free email scraper extension"
119
],
1210
"people_ask_for": [
1311
"Is web scraping emails legal?",
14-
"Can you scrape emails from websites?",
15-
"Is email harvesting illegal?",
16-
"Does email scraping work?"
12+
"How to scrape emails from the web?",
13+
"Can ChatGPT scrape emails?",
14+
"Is web scraping on Google illegal?"
1715
]
1816
}

0 commit comments

Comments
 (0)