Merge pull request #1921 from Juhibhojani/master

Yashbhadiyadra · web-flow · commit 1c1dbaa02696 · 2023-06-24T11:48:36.000+05:30
[GSSoC'23] TOI Scrapper
diff --git a/TOI_Scrapper/README.MD b/TOI_Scrapper/README.MD
@@ -0,0 +1,2 @@
+# Times of India Scrapper
+The news headlines and their links are scraped, and the output is presented as a pandas DataFrame.
diff --git a/TOI_Scrapper/TOI_Scrapper.py b/TOI_Scrapper/TOI_Scrapper.py
@@ -0,0 +1,47 @@
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+
+def scrapper():
+    """Scrape headlines and their links from the Times of India home page.
+
+    Every ``<div class="col_l_6">`` on the page that contains both a
+    ``<figcaption>`` and an ``<a>`` contributes one row: the stripped caption
+    text and the ``href`` of the first anchor inside the div.
+
+    Returns:
+        pandas.DataFrame: columns ``News_Headline`` and ``News_Link``; empty
+        when no matching element is found.
+
+    Raises:
+        requests.RequestException: if the request fails, times out, or the
+            server answers with an HTTP error status.
+    """
+    # Access the TOI home page disguised as a browser.
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}
+    # Without a timeout, requests can block forever on a stalled connection.
+    response = requests.get('https://timesofindia.indiatimes.com/', headers=headers, timeout=30)
+    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty frame.
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, 'lxml')
+    news = []
+    link_list = []
+
+    for block in soup.find_all('div', class_='col_l_6'):
+        figcaption = block.find('figcaption')
+        anchor = block.find('a')
+        # A div lacking either element cannot yield a headline/link pair;
+        # the anchor check avoids an AttributeError on ``None.get``.
+        if figcaption is None or anchor is None:
+            continue
+        news.append(figcaption.text.strip())
+        link_list.append(anchor.get("href"))
+
+    return pd.DataFrame({'News_Headline': news, 'News_Link': link_list})
+
+
+# Runs on import as well as when executed as a script, as in the original.
+TOI_headline = scrapper()
+print(TOI_headline)
diff --git a/TOI_Scrapper/requirements.txt b/TOI_Scrapper/requirements.txt
@@ -0,0 +1,4 @@
+requests
+beautifulsoup4
+pandas
+lxml

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Times of India Scrapper`
	`2`	`+The news headlines and their links are scraped, and the output is presented as a pandas DataFrame.`