
Commit 536cbdc

Web_Scraper Added
1 parent 7df8fcd commit 536cbdc

File tree

5 files changed, +84 -0 lines changed

The two remaining files (1.76 KB and 85.5 KB) are shown without a rendered diff.

scripts/Web_Scrapper/readme.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Python Web Scraper

## About

A Python script that runs a Google search for your query and returns the title, description, and link of each result.
## Setup

- Install Python 3 for Windows.
- Open Windows Command Prompt.
- Clone the repository: `git clone https://github.com/GDSC-RCCIIT/General-Purpose-Scripts.git`
- Navigate into the script directory: `cd General-Purpose-Scripts/scripts/Web_Scrapper`
- Install the dependencies and run the script:

```
pip install -r requirements.txt
python web_scraper.py
```
## Usage

- The script will ask for input: enter whatever you want to search, e.g. Amazon.

<img src="./images/web_scraper-1.PNG" width="500"/>
<img src="./images/web_scraper-2.PNG" width="500"/>
scripts/Web_Scrapper/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
beautifulsoup4==4.10.0
fake-useragent==0.1.11
urllib3==1.26.7
scripts/Web_Scrapper/web_scraper.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
import urllib.parse
import requests  # note: requests is imported here but is not listed in requirements.txt
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re

query = input("Enter what you want to search : ")
query = urllib.parse.quote_plus(query)  # Format into URL encoding
number_result = 50  # Request up to 50 results
ua = UserAgent()
google_url = "https://www.google.com/search?q=" + \
    query + "&num=" + str(number_result)
response = requests.get(google_url, headers={"User-Agent": ua.random})
soup = BeautifulSoup(response.text, "html.parser")

result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})

links = []         # Links to results
titles = []        # Titles of results
descriptions = []  # Descriptions of results
for r in result_div:
    # Skip any result block that is missing one of the expected elements
    try:
        link = r.find('a', href=True)
        title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
        description = r.find('div', attrs={'class': 's3v9rd'}).get_text()

        # Check to make sure everything is present before appending
        if link and title and description:
            links.append(link['href'])
            titles.append(title)
            descriptions.append(description)
    # Move to the next result if one element is not present
    except AttributeError:
        continue

to_remove = []
clean_links = []
for i, l in enumerate(links):
    # Google wraps each result as /url?q=<target>&sa=...; capture <target>
    clean = re.search(r'/url\?q=(.*)&sa', l)

    # Anything that doesn't fit the above pattern will be removed
    if clean is None:
        to_remove.append(i)
        continue
    clean_links.append(clean.group(1))

# Remove the corresponding titles & descriptions
# (delete from the end so earlier indices stay valid)
for x in reversed(to_remove):
    del titles[x]
    del descriptions[x]

for i in range(len(clean_links)):
    print(titles[i])
    print(descriptions[i])
    print(clean_links[i])
    print()
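A side note on the regex step above: each Google result href has the form `/url?q=<target>&sa=...`, and the regex simply captures the part between `q=` and `&sa`. If a regex feels fragile, the same extraction could be done with the standard library's URL parsing, as in this small sketch (the helper name is hypothetical and not part of the committed script; `parse_qs` also percent-decodes the value, which the regex does not):

```
from urllib.parse import urlparse, parse_qs

def extract_destination(redirect_href):
    # Hypothetical helper: pull the q parameter out of a Google
    # "/url?q=<target>&sa=..." redirect href and return the decoded URL.
    params = parse_qs(urlparse(redirect_href).query)
    target = params.get("q")
    return target[0] if target else None

# Example:
# extract_destination("/url?q=https://www.amazon.com/&sa=U") -> "https://www.amazon.com/"
```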
