
Commit 536cbdc

Web_Scraper Added
1 parent 7df8fcd commit 536cbdc

File tree

5 files changed, +84 -0 lines changed

The two remaining files (1.76 KB and 85.5 KB) are shown without a rendered diff.

scripts/Web_Scrapper/readme.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Python Web Scraper

## About

A Python script that runs a Google search for your query and returns the title, description, and link of each result.
## Setup

- Install Python 3 for Windows.
- Open Windows Command Prompt.
- Clone the repository: `git clone https://github.com/GDSC-RCCIIT/General-Purpose-Scripts.git`
- Navigate into the script directory: `cd General-Purpose-Scripts/scripts/Web_Scrapper`
- Install the dependencies and run the script:

```
pip install -r requirements.txt
python web_scraper.py
```
## Usage

- The script will ask for input: enter whatever you want to search, e.g. Amazon.

<img src="./images/web_scraper-1.PNG" width="500"/>
<img src="./images/web_scraper-2.PNG" width="500"/>
scripts/Web_Scrapper/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
beautifulsoup4==4.10.0
fake-useragent==0.1.11
urllib3==1.26.7
scripts/Web_Scrapper/web_scraper.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
import urllib.parse
import requests  # note: requests is imported here but is not listed in requirements.txt
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re

query = input("Enter what you want to search : ")
query = urllib.parse.quote_plus(query)  # Format into URL encoding
number_result = 50  # Request up to 50 results
ua = UserAgent()
google_url = "https://www.google.com/search?q=" + \
    query + "&num=" + str(number_result)
response = requests.get(google_url, headers={"User-Agent": ua.random})
soup = BeautifulSoup(response.text, "html.parser")

result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})

links = []         # Links to results
titles = []        # Titles of results
descriptions = []  # Descriptions of results
for r in result_div:
    # Skip any result block that is missing one of the expected elements
    try:
        link = r.find('a', href=True)
        title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
        description = r.find('div', attrs={'class': 's3v9rd'}).get_text()

        # Check to make sure everything is present before appending
        if link and title and description:
            links.append(link['href'])
            titles.append(title)
            descriptions.append(description)
    # Move to the next result if one element is not present
    except AttributeError:
        continue

to_remove = []
clean_links = []
for i, l in enumerate(links):
    # Google wraps each result as /url?q=<target>&sa=...; capture <target>
    clean = re.search(r'/url\?q=(.*)&sa', l)

    # Anything that doesn't fit the above pattern will be removed
    if clean is None:
        to_remove.append(i)
        continue
    clean_links.append(clean.group(1))

# Remove the corresponding titles & descriptions
# (delete from the end so earlier indices stay valid)
for x in reversed(to_remove):
    del titles[x]
    del descriptions[x]

for i in range(len(clean_links)):
    print(titles[i])
    print(descriptions[i])
    print(clean_links[i])
    print()
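A side note on the regex step above: each Google result href has the form `/url?q=<target>&sa=...`, and the regex simply captures the part between `q=` and `&sa`. If a regex feels fragile, the same extraction could be done with the standard library's URL parsing, as in this small sketch (the helper name is hypothetical and not part of the committed script; `parse_qs` also percent-decodes the value, which the regex does not):

```
from urllib.parse import urlparse, parse_qs

def extract_destination(redirect_href):
    # Hypothetical helper: pull the q parameter out of a Google
    # "/url?q=<target>&sa=..." redirect href and return the decoded URL.
    params = parse_qs(urlparse(redirect_href).query)
    target = params.get("q")
    return target[0] if target else None

# Example:
# extract_destination("/url?q=https://www.amazon.com/&sa=U") -> "https://www.amazon.com/"
```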
