Skip to content

Commit 18da4eb

Browse files
committed
remove rich snippet scraper and update keyword scraper
1 parent fa22579 commit 18da4eb

File tree

7 files changed

+167
-250
lines changed

7 files changed

+167
-250
lines changed

bing-scraper/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,4 @@ This Bing.com scraper uses __Python 3.10__ with [scrapfly-sdk](https://pypi.org/
4242
# or specific scraping areas
4343
$ poetry run pytest test.py -k test_serp_scraping
4444
$ poetry run pytest test.py -k test_keyword_scraping
45-
$ poetry run pytest test.py -k test_rich_snippets_scraping
4645
```

bing-scraper/bing.py

Lines changed: 3 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
"debug":True,
2323
"os":"linux",
2424
"auto_scroll":True,
25-
2625
}
2726

2827

@@ -38,7 +37,7 @@ def parse_serps(response: ScrapeApiResponse) -> List[Dict]:
3837
url = result.xpath(".//h2/a/@href").get()
3938
description = result.xpath("normalize-space(.//div/p)").extract_first()
4039
date = result.xpath(".//span[@class='news_dt']/text()").get()
41-
if data is not None and date is not None and len(date) > 12:
40+
if data is not None and len(date) > 12:
4241
date_pattern = re.compile(r"\b\d{2}-\d{2}-\d{4}\b")
4342
date_pattern.findall(description)
4443
dates = date_pattern.findall(date)
@@ -59,54 +58,14 @@ def parse_serps(response: ScrapeApiResponse) -> List[Dict]:
5958
)
6059
return data
6160

62-
6361
def parse_keywords(response: ScrapeApiResponse) -> Dict:
6462
"""parse FAQs and popular keywords on bing search pages"""
6563
selector = response.selector
66-
faqs = []
67-
for faq in selector.xpath("//*[*[div[contains(@data-tag, 'RelatedQnA.Item')]]]"):
68-
url = faq.xpath(".//a/@href").get()
69-
faqs.append(
70-
{
71-
"query": faq.xpath(".//div[contains(@data-tag, 'RelatedQnA.Item')]/@data-query").get(),
72-
"answer": faq.xpath(".//span[contains(@data-tag, 'QnA')]/text()").get(),
73-
"title": "".join(faq.xpath(".//div[@class='b_algo']/h2/*//text()").extract()),
74-
"domain": url.split("https://")[-1].split("/")[0].replace("www.", "")if url else None,
75-
"url": url,
76-
}
77-
)
7864
related_keywords = []
7965
for keyword in selector.xpath(".//li[@class='b_ans']/div/ul/li"):
8066
related_keywords.append("".join(keyword.xpath(".//a/div//text()").extract()))
8167

82-
return {"FAQs": faqs, "related_keywords": related_keywords}
83-
84-
85-
def parse_rich_snippet(response: ScrapeApiResponse) -> Dict:
86-
"""parse rich snippets from Bing search"""
87-
selector = response.selector
88-
data = {}
89-
data["title"] = " ".join(selector.xpath("//div[@class='l_ecrd_hero_ttl']//h2//text()").getall())
90-
data["link"] = selector.xpath("//div[@class='l_ecrd_hero_ttl']/div/a/@href").get()
91-
data["heading"] = " ".join(selector.xpath("//a[@title]/h2/span/text()").getall())
92-
data["links"] = {}
93-
for item in selector.xpath("//div[contains(@class, 'webicons')]/div"):
94-
name = item.xpath(".//a/@title").get()
95-
link = item.xpath(".//a/@href").get()
96-
data["links"][name] = link
97-
98-
data["info"] = {}
99-
for row in selector.xpath("//div[contains(@class, 'expansion')]/div[contains(@class, 'row')]"):
100-
key = row.xpath(".//div/div/a[1]/text()").get().strip()
101-
value = row.xpath("string(.//div[not(contains(@class, 'title'))])").get().strip().replace(key, "")
102-
data["info"][key] = value
103-
104-
all_text = ""
105-
for div_element in selector.xpath("//div[@class='lite-entcard-blk l_ecrd_bkg_hlt']"):
106-
div_text = div_element.xpath("string(.)").get().strip()
107-
all_text += div_text + "\n"
108-
data["descrption"] = all_text
109-
return data
68+
return related_keywords
11069

11170

11271
async def scrape_search(query: str, max_pages: int = None):
@@ -130,24 +89,13 @@ async def scrape_search(query: str, max_pages: int = None):
13089
log.success(f"scraped {len(serp_data)} search results from Bing search")
13190
return serp_data
13291

133-
13492
async def scrape_keywords(query: str):
13593
"""scrape bing search pages for keyword data"""
13694
url = f"https://www.bing.com/search?{urlencode({'q': query})}"
13795
log.info("scraping Bing search for keyword data")
13896
response = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG, render_js=True))
13997
keyword_data = parse_keywords(response)
14098
log.success(
141-
f"scraped {len(keyword_data['related_keywords'])} keywords and {len(keyword_data['FAQs'])} FAQs from Bing search"
99+
f"scraped {len(keyword_data)} keywords from Bing search"
142100
)
143101
return keyword_data
144-
145-
146-
async def scrape_rich_snippets(query: str):
147-
"""scrape bing search pages for rich snippets data"""
148-
url = f"https://www.bing.com/search?{urlencode({'q': query})}"
149-
log.info("scraping Bing search for keyword data")
150-
response = await SCRAPFLY.async_scrape(ScrapeConfig(url, asp=True, country="GB", render_js=True))
151-
rich_snippet_data = parse_rich_snippet(response)
152-
log.success(f"scraped {len(rich_snippet_data)} rich snippets fields from Bing search")
153-
return rich_snippet_data

bing-scraper/results/keywords.json

Lines changed: 18 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,18 @@
1-
{
2-
"FAQs": [
3-
{
4-
"query": "How to scrape emails from a website?",
5-
"answer": "We will have to set up our scraper to click on each profile and extract their email. Next, you will need a web scraper that can scrape emails from any website. For this example, we will download and install ParseHub, a free and powerful web scraper that works with any website. Now it’s time to get scraping. Open ParseHub and click on “New Project”.",
6-
"title": "How to Scrape Emails from any Website | ParseHub",
7-
"domain": "bing.com",
8-
"url": "https://www.bing.com/ck/a?!&&p=41cadb349b48845d81946a4548ede103e848c4600607f191c287af779dcfe29dJmltdHM9MTc1NzI4OTYwMA&ptn=3&ver=2&hsh=4&fclid=1ecd1879-483a-6ddd-344b-0e1849f86cee&u=a1aHR0cHM6Ly93d3cucGFyc2VodWIuY29tL2Jsb2cvc2NyYXBlLWVtYWlscy8&ntb=1"
9-
},
10-
{
11-
"query": "How does email scraping work?",
12-
"answer": "The key is that scraping tools — or email scrapers — are built to ",
13-
"title": "How to Scrape Emails for Marketing (+ 6 Top Tools in 2025)",
14-
"domain": "bing.com",
15-
"url": "https://www.bing.com/ck/a?!&&p=8757a5f5e1a0f4c8e2b87e05d94f4add67393122202cac7d08398b06eaf90256JmltdHM9MTc1NzI4OTYwMA&ptn=3&ver=2&hsh=4&fclid=1ecd1879-483a-6ddd-344b-0e1849f86cee&u=a1aHR0cHM6Ly93d3cubGluZHkuYWkvYmxvZy9zY3JhcGluZy1lbWFpbHM&ntb=1"
16-
},
17-
{
18-
"query": "What happens if you scrape emails?",
19-
"answer": "Technical: Bad scraping can ",
20-
"title": "How to Scrape Emails for Marketing (+ 6 Top Tools in 2025)",
21-
"domain": "bing.com",
22-
"url": "https://www.bing.com/ck/a?!&&p=8757a5f5e1a0f4c8e2b87e05d94f4add67393122202cac7d08398b06eaf90256JmltdHM9MTc1NzI4OTYwMA&ptn=3&ver=2&hsh=4&fclid=1ecd1879-483a-6ddd-344b-0e1849f86cee&u=a1aHR0cHM6Ly93d3cubGluZHkuYWkvYmxvZy9zY3JhcGluZy1lbWFpbHM&ntb=1"
23-
},
24-
{
25-
"query": "Should you use a web scraper for email marketing?",
26-
"answer": "With the help of a free web scraper and by carefully selecting your lead sources, you can quickly build a high-quality email list. You can then use this list for email marketing efforts or use it as a custom audience in Google or Facebook Ads. Also, many companies web scrape email lists for their prospecting or cold email outreach.",
27-
"title": "How to Scrape Emails from any Website | ParseHub",
28-
"domain": "bing.com",
29-
"url": "https://www.bing.com/ck/a?!&&p=41cadb349b48845d81946a4548ede103e848c4600607f191c287af779dcfe29dJmltdHM9MTc1NzI4OTYwMA&ptn=3&ver=2&hsh=4&fclid=1ecd1879-483a-6ddd-344b-0e1849f86cee&u=a1aHR0cHM6Ly93d3cucGFyc2VodWIuY29tL2Jsb2cvc2NyYXBlLWVtYWlscy8&ntb=1"
30-
},
31-
{
32-
"query": "What is the best tool for scraping email addresses?",
33-
"answer": "This saves all the unique scraped email addresses to a CSV file named \"scraped_emails.csv\". While ",
34-
"title": "How to Scrape Emails from Any Website in 2025: The Ultimate Guide",
35-
"domain": "bing.com",
36-
"url": "https://www.bing.com/ck/a?!&&p=b81d9e6a78395567b82499d19db64ba9600724423404794bfa655b0ddefe6ef9JmltdHM9MTc1NzI4OTYwMA&ptn=3&ver=2&hsh=4&fclid=1ecd1879-483a-6ddd-344b-0e1849f86cee&u=a1aHR0cHM6Ly93d3cubWFya2V0aW5nc2Nvb3AuY29tL3RlY2gvd2ViLXNjcmFwaW5nL2hvdy10by1zY3JhcGUtZW1haWxzLWZyb20tYW55LXdlYnNpdGUtZm9yLXNhbGVzLXByb3NwZWN0aW5nLw&ntb=1"
37-
},
38-
{
39-
"query": "How to scrape emails from Google Maps?",
40-
"answer": "If that’s not enough, we wrote a separate guide on scraping emails from Google Maps. To run the scraper, enter your HasData API key and either a list of websites or keywords, depending on the method you choose. Then, run the scraper: When it’s done, you can copy the data table (or part of it) or download the data as JSON or CSV.",
41-
"title": "Scrape Emails from Any Websites: From Python to AI Tools",
42-
"domain": "bing.com",
43-
"url": "https://www.bing.com/ck/a?!&&p=834f7d6add2db48cca6a6ef4bb289361a9bb65158afb1ef5f0db1296039c7d73JmltdHM9MTc1NzI4OTYwMA&ptn=3&ver=2&hsh=4&fclid=1ecd1879-483a-6ddd-344b-0e1849f86cee&u=a1aHR0cHM6Ly9oYXNkYXRhLmNvbS9ibG9nL2VtYWlsLXNjcmFwaW5n&ntb=1"
44-
}
45-
],
46-
"related_keywords": [
47-
"free email scraping tool",
48-
"website email scraper",
49-
"web email scraper",
50-
"best free online email extractor",
51-
"how to delete emails from website",
52-
"pull email addresses from website",
53-
"extract email from website free",
54-
"extract emails from website",
55-
"free email grabber",
56-
"extract email address from website",
57-
"extract email addresses from website",
58-
"email extractor from website free",
59-
"free email scraping tool",
60-
"website email scraper",
61-
"web email scraper",
62-
"best free online email extractor",
63-
"how to delete emails from website",
64-
"pull email addresses from website",
65-
"extract email from website free",
66-
"extract emails from website",
67-
"free email grabber",
68-
"extract email address from website",
69-
"extract email addresses from website",
70-
"email extractor from website free"
71-
]
72-
}
1+
[
2+
"email scraper",
3+
"email scraping software",
4+
"extract emails from web page",
5+
"scrape emails from website free",
6+
"scrape email addresses from websites",
7+
"scrape emails from website",
8+
"pull email addresses from website",
9+
"how to delete emails from website",
10+
"email scraper",
11+
"email scraping software",
12+
"extract emails from web page",
13+
"scrape emails from website free",
14+
"scrape email addresses from websites",
15+
"scrape emails from website",
16+
"pull email addresses from website",
17+
"how to delete emails from website"
18+
]

bing-scraper/results/rich_snippets.json

Lines changed: 0 additions & 8 deletions
This file was deleted.

0 commit comments

Comments
 (0)