Commit bfb1e89

Merge pull request #2635 from Mihan786Chistie/techCrunch
added Tech crunch Scraper
2 parents 57fe6d4 + 2dc2121 commit bfb1e89

File tree

3 files changed: +173 additions, 0 deletions


TechCrunch-Scraper/README.md

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
## Tech Crunch

### Scrape articles with title, description, image, author, date and link

Create an instance of the `TechCrunch` class.

```python
articles = TechCrunch()
```

| Methods           | Details                                                                                                            |
| ----------------- | ------------------------------------------------------------------------------------------------------------------ |
| `.get_articles()` | Returns the articles with title, description, image, author, date and link for a given category, in JSON format    |
| `.search()`       | Returns the searched articles with title, description, image, author, date and link for a given topic, in JSON format |

---
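
For reference, a minimal usage sketch (assuming `techCrunch.py` from this commit is importable from the working directory and that both methods return JSON strings, as documented above):

```python
# Minimal usage sketch -- assumes techCrunch.py is on the import path.
import json

from techCrunch import TechCrunch

scraper = TechCrunch()

# Articles for a category slug, e.g. "artificial-intelligence".
category_result = json.loads(scraper.get_articles("artificial-intelligence"))
for article in category_result.get("articles", []):
    print(article["title"], "->", article["link"])

# Search results for a topic.
search_result = json.loads(scraper.search("github"))
print(f"Found {len(search_result.get('articles', []))} results for 'github'")
```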

TechCrunch-Scraper/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
beautifulsoup4
requests
# json is part of the Python standard library and does not need to be installed

TechCrunch-Scraper/techCrunch.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
import json

import requests
from bs4 import BeautifulSoup


class TechCrunch:
    """
    Class - `TechCrunch`
    Example:
    ```
    articles = TechCrunch()
    ```
    Methods:
    1. `.get_articles()` | Response - Articles with title, description, image, author, date and link.
    2. `.search()` | Response - Searched articles with title, description, image, author, date and link.
    """

    def get_articles(self, category):
        """
        Class - `TechCrunch`
        Example:
        ```
        articles = TechCrunch()
        articles.get_articles("artificial-intelligence")
        ```
        Returns:
        {
            "title": Title of the article
            "description": Description of the article
            "image": Image of the article
            "author": Author of the article
            "date": Date the article was posted
            "link": Link to the article
        }
        """
        # Build the category URL; spaces in the category name become hyphens.
        url = (
            "https://techcrunch.com/category/" + category.replace(" ", "-").lower()
        )
        try:
            res = requests.get(url)
            soup = BeautifulSoup(res.text, "html.parser")

            articles_data = {"articles": []}

            articles = soup.find_all(
                "div", class_="post-block post-block--image post-block--unread"
            )
            for n in articles:
                name = (
                    n.select_one(".post-block__title__link")
                    .getText()
                    .strip()
                    .encode("ascii", "ignore")
                    .decode()
                )
                desc = (
                    n.select_one(".post-block__content")
                    .getText()
                    .strip()
                    .encode("ascii", "ignore")
                    .decode()
                )
                img = n.find_all("img", src=True)
                image = img[0]["src"]
                author = (
                    n.select_one(".river-byline__authors")
                    .getText()
                    .strip()
                    .encode("ascii", "ignore")
                    .decode()
                )
                time = n.find_all("div", class_="river-byline")
                date = (
                    time[0]
                    .select_one(".river-byline__time")
                    .getText()
                    .strip()
                    .encode("ascii", "ignore")
                    .decode()
                )
                links = n.find_all("a", class_="post-block__title__link", href=True)
                link = links[0]["href"]
                articles_data["articles"].append(
                    {
                        "title": name,
                        "description": desc,
                        "image": image,
                        "author": author,
                        "date": date,
                        "link": link,
                    }
                )
            return json.dumps(articles_data)
        # Catch network failures as well as pages whose markup no longer matches
        # the selectors above (a bare ValueError would miss both cases).
        except (requests.RequestException, AttributeError, IndexError, TypeError):
            error_message = {
                "message": "Can't fetch any articles from the topic provided."
            }
            return json.dumps(error_message)

    def search(self, topic):
        """
        Class - `TechCrunch`
        Example:
        ```
        articles = TechCrunch()
        articles.search("github")
        ```
        Returns:
        {
            "title": Title of the article
            "description": Description of the article
            "image": Image of the article
            "author": Author of the article
            "date": Date the article was posted
            "link": Link to the article
        }
        """
        url = "https://search.techcrunch.com/search?p=" + topic + "&fr=techcrunch"
        try:
            res = requests.get(url)
            soup = BeautifulSoup(res.text, "html.parser")

            articles_data = {"articles": []}

            articles = soup.find_all("li", class_="ov-a mt-0 pt-26 pb-26 bt-dbdbdb")
            for i in articles:
                name = i.find("a", class_="fz-20 lh-22 fw-b").getText()
                desc = i.find("p", class_="fz-14 lh-20 c-777").getText()
                img = i.find("img", class_="s-img mr-10 s-img-errchk", src=True)
                image = img["src"]
                author = i.find("span", class_="mr-15").getText()
                date = i.find("span", class_="pl-15 bl-1-666").getText()
                links = i.find("a", class_="fz-20 lh-22 fw-b", href=True)
                link = links["href"]
                articles_data["articles"].append(
                    {
                        "title": name,
                        "description": desc,
                        "image": image,
                        "author": author,
                        "date": date,
                        "link": link,
                    }
                )
            # Return JSON for consistency with get_articles and the README.
            return json.dumps(articles_data)
        except (requests.RequestException, AttributeError, TypeError):
            error_message = {
                "message": "Can't fetch any articles from the topic provided."
            }
            return json.dumps(error_message)
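
Both methods fall back to a `{"message": ...}` payload when a fetch or parse fails, so callers should check for the `articles` key before iterating. A hypothetical smoke-test script (not part of this commit) illustrating that handling:

```python
# Hypothetical smoke-test script -- not part of this commit.
import json

from techCrunch import TechCrunch

scraper = TechCrunch()
result = json.loads(scraper.get_articles("artificial-intelligence"))
if "articles" in result:
    print(f"get_articles returned {len(result['articles'])} articles")
else:
    # On failure the scraper returns {"message": "..."} instead of "articles".
    print("fetch failed:", result["message"])
```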
