Commit bfb1e89

Merge pull request #2635 from Mihan786Chistie/techCrunch
added Tech crunch Scraper
2 parents 57fe6d4 + 2dc2121 commit bfb1e89

File tree

3 files changed: +173 additions, 0 deletions


TechCrunch-Scraper/README.md

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
## Tech Crunch

### Scrape articles with title, description, image, author, date and link

Create an instance of the `TechCrunch` class.

```python
articles = TechCrunch()
```

| Methods           | Details                                                                                                            |
| ----------------- | ------------------------------------------------------------------------------------------------------------------ |
| `.get_articles()` | Returns the articles with title, description, image, author, date and link for a given category, in JSON format    |
| `.search()`       | Returns the searched articles with title, description, image, author, date and link for a given topic, in JSON format |

---
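
For reference, a minimal usage sketch (assuming `techCrunch.py` from this commit is importable from the working directory and that both methods return JSON strings, as documented above):

```python
# Minimal usage sketch -- assumes techCrunch.py is on the import path.
import json

from techCrunch import TechCrunch

scraper = TechCrunch()

# Articles for a category slug, e.g. "artificial-intelligence".
category_result = json.loads(scraper.get_articles("artificial-intelligence"))
for article in category_result.get("articles", []):
    print(article["title"], "->", article["link"])

# Search results for a topic.
search_result = json.loads(scraper.search("github"))
print(f"Found {len(search_result.get('articles', []))} results for 'github'")
```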

TechCrunch-Scraper/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
beautifulsoup4
requests
# json is part of the Python standard library and does not need to be installed

TechCrunch-Scraper/techCrunch.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
import json

import requests
from bs4 import BeautifulSoup


class TechCrunch:
    """
    Class - `TechCrunch`
    Example:
    ```
    articles = TechCrunch()
    ```
    Methods:
    1. `.get_articles()` | Response - Articles with title, description, image, author, date and link.
    2. `.search()` | Response - Searched articles with title, description, image, author, date and link.
    """

    def get_articles(self, category):
        """
        Class - `TechCrunch`
        Example:
        ```
        articles = TechCrunch()
        articles.get_articles("artificial-intelligence")
        ```
        Returns:
        {
            "title": Title of the article
            "description": Description of the article
            "image": Image of the article
            "author": Author of the article
            "date": Date the article was posted
            "link": Link to the article
        }
        """
        # Build the category URL; spaces in the category name become hyphens.
        url = (
            "https://techcrunch.com/category/" + category.replace(" ", "-").lower()
        )
        try:
            res = requests.get(url)
            soup = BeautifulSoup(res.text, "html.parser")

            articles_data = {"articles": []}

            articles = soup.find_all(
                "div", class_="post-block post-block--image post-block--unread"
            )
            for n in articles:
                name = (
                    n.select_one(".post-block__title__link")
                    .getText()
                    .strip()
                    .encode("ascii", "ignore")
                    .decode()
                )
                desc = (
                    n.select_one(".post-block__content")
                    .getText()
                    .strip()
                    .encode("ascii", "ignore")
                    .decode()
                )
                img = n.find_all("img", src=True)
                image = img[0]["src"]
                author = (
                    n.select_one(".river-byline__authors")
                    .getText()
                    .strip()
                    .encode("ascii", "ignore")
                    .decode()
                )
                time = n.find_all("div", class_="river-byline")
                date = (
                    time[0]
                    .select_one(".river-byline__time")
                    .getText()
                    .strip()
                    .encode("ascii", "ignore")
                    .decode()
                )
                links = n.find_all("a", class_="post-block__title__link", href=True)
                link = links[0]["href"]
                articles_data["articles"].append(
                    {
                        "title": name,
                        "description": desc,
                        "image": image,
                        "author": author,
                        "date": date,
                        "link": link,
                    }
                )
            return json.dumps(articles_data)
        # Catch network failures as well as pages whose markup no longer matches
        # the selectors above (a bare ValueError would miss both cases).
        except (requests.RequestException, AttributeError, IndexError, TypeError):
            error_message = {
                "message": "Can't fetch any articles from the topic provided."
            }
            return json.dumps(error_message)

    def search(self, topic):
        """
        Class - `TechCrunch`
        Example:
        ```
        articles = TechCrunch()
        articles.search("github")
        ```
        Returns:
        {
            "title": Title of the article
            "description": Description of the article
            "image": Image of the article
            "author": Author of the article
            "date": Date the article was posted
            "link": Link to the article
        }
        """
        url = "https://search.techcrunch.com/search?p=" + topic + "&fr=techcrunch"
        try:
            res = requests.get(url)
            soup = BeautifulSoup(res.text, "html.parser")

            articles_data = {"articles": []}

            articles = soup.find_all("li", class_="ov-a mt-0 pt-26 pb-26 bt-dbdbdb")
            for i in articles:
                name = i.find("a", class_="fz-20 lh-22 fw-b").getText()
                desc = i.find("p", class_="fz-14 lh-20 c-777").getText()
                img = i.find("img", class_="s-img mr-10 s-img-errchk", src=True)
                image = img["src"]
                author = i.find("span", class_="mr-15").getText()
                date = i.find("span", class_="pl-15 bl-1-666").getText()
                links = i.find("a", class_="fz-20 lh-22 fw-b", href=True)
                link = links["href"]
                articles_data["articles"].append(
                    {
                        "title": name,
                        "description": desc,
                        "image": image,
                        "author": author,
                        "date": date,
                        "link": link,
                    }
                )
            # Return JSON for consistency with get_articles and the README.
            return json.dumps(articles_data)
        except (requests.RequestException, AttributeError, TypeError):
            error_message = {
                "message": "Can't fetch any articles from the topic provided."
            }
            return json.dumps(error_message)
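
Both methods fall back to a `{"message": ...}` payload when a fetch or parse fails, so callers should check for the `articles` key before iterating. A hypothetical smoke-test script (not part of this commit) illustrating that handling:

```python
# Hypothetical smoke-test script -- not part of this commit.
import json

from techCrunch import TechCrunch

scraper = TechCrunch()
result = json.loads(scraper.get_articles("artificial-intelligence"))
if "articles" in result:
    print(f"get_articles returned {len(result['articles'])} articles")
else:
    # On failure the scraper returns {"message": "..."} instead of "articles".
    print("fetch failed:", result["message"])
```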
