
Commit 60ebba9

Merge pull request #362 from Shreyanshmmgn/shreyansh_web_scrapper
Web_Scraper Added
2 parents cb0b633 + 02dc6e8 commit 60ebba9

File tree

5 files changed: +83 −0 lines changed

Two of the changed files (1.76 KB and 85.5 KB) are images and are not shown as text diffs below.

scripts/Web_Scrapper/readme.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Python Web Scraper

## About

A Python script that runs a Google search and returns the titles, descriptions, and links of the results.

## Setup

- Install Python 3 for Windows.
- Open Windows Command Prompt.
- Clone the repository: git clone https://github.com/GDSC-RCCIIT/General-Purpose-Scripts.git
- Navigate into the script's directory: cd General-Purpose-Scripts/scripts/Web_Scrapper
- Install the dependencies and run the script:

```
pip install -r requirements.txt
python web_scraper.py
```

## Usage

- The script will ask for input: enter whatever you want to search, e.g. Amazon.
<img src="./images/web_scraper-1.PNG" width="500"/>
<img src="./images/web_scraper-2.PNG" width="500"/>
scripts/Web_Scrapper/requirements.txt

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
beautifulsoup4==4.10.0
fake-useragent==0.1.11
urllib3==1.26.7
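
The list pins beautifulsoup4 and fake-useragent, the two third-party packages web_scraper.py imports directly, plus urllib3 (the low-level HTTP library that requests uses); requests itself is imported by the script but not pinned here. A minimal sketch, with a made-up HTML fragment standing in for a Google result block, of what the two direct imports do:

```
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

ua = UserAgent()
print(ua.random)  # a random browser User-Agent string, sent so the request looks like an ordinary browser

# Made-up fragment shaped like the result blocks web_scraper.py looks for
html = '<div class="ZINbbc"><a href="/url?q=https://example.com/&sa=U">Example</a></div>'
soup = BeautifulSoup(html, "html.parser")
print(soup.find("div", attrs={"class": "ZINbbc"}).a["href"])  # -> /url?q=https://example.com/&sa=U
```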
scripts/Web_Scrapper/web_scraper.py

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
import urllib.parse
import re

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

query = input("Enter what you want to search : ")
query = urllib.parse.quote_plus(query)  # Format the query into URL encoding
number_result = 50  # Ask Google for up to 50 results
ua = UserAgent()
google_url = "https://www.google.com/search?q=" + query + "&num=" + str(number_result)
# Send the User-Agent as a request header so the request looks like a normal browser
response = requests.get(google_url, headers={"User-Agent": ua.random})
soup = BeautifulSoup(response.text, "html.parser")

result_div = soup.find_all("div", attrs={"class": "ZINbbc"})

links = []  # Links to results
titles = []  # Titles of results
descriptions = []  # Descriptions of results
for r in result_div:
    # If an element is missing, calling .get_text() on None raises AttributeError
    try:
        link = r.find("a", href=True)
        title = r.find("div", attrs={"class": "vvjwJb"}).get_text()
        description = r.find("div", attrs={"class": "s3v9rd"}).get_text()

        # Check that everything is present before appending
        if link and title and description:
            links.append(link["href"])
            titles.append(title)
            descriptions.append(description)
    # Skip to the next result if one element is not present
    except AttributeError:
        continue

to_remove = []
clean_links = []
for i, l in enumerate(links):
    clean = re.search(r"/url\?q=(.*)&sa", l)

    # Anything that doesn't fit the pattern above will be removed
    if clean is None:
        to_remove.append(i)
        continue
    clean_links.append(clean.group(1))

# Remove the corresponding titles & descriptions
# (delete from the highest index first so earlier indices don't shift)
for x in reversed(to_remove):
    del titles[x]
    del descriptions[x]

for i in range(len(clean_links)):
    print(titles[i])
    print(descriptions[i])
    print(clean_links[i])
    print()
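
The final part of the script strips Google's redirect wrapper from each result href before printing. A minimal sketch (not part of the commit; the sample href is an assumption about the redirect format implied by the regex above) of what the extraction yields:

```
import re

# Sample href shaped like the /url?q=...&sa=... redirects the regex above expects
href = "/url?q=https://www.example.com/page&sa=U&ved=abc"
clean = re.search(r"/url\?q=(.*)&sa", href)
print(clean.group(1))  # -> https://www.example.com/page
```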
