g4g.py
forked from CuriousLearner/GeeksForGeeksScrapper
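This script prompts for a GeeksForGeeks category, scrapes every article linked from that category's index page, bundles the articles into a single HTML file with a clickable index, and shells out to wkhtmltopdf to convert the result to a PDF. It depends on requests, BeautifulSoup, the repository's local article module, and a style.min.css stylesheet next to the generated HTML.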
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
from os import system
from sys import exit
from time import sleep
from requests.exceptions import RequestException
from article import Article

BASE_URL = 'http://www.geeksforgeeks.org/'
articles = []
choice_to_category = {1: 'c', 2: 'c-plus-plus', 3: 'java', 4: 'python',
                      5: 'fundamentals-of-algorithms',
                      6: 'data-structures'}


def display_menu():
    print("Choose category to scrape: ")
    print("1. C Language")
    print("2. C++ Language")
    print("3. Java")
    print("4. Python")
    print("5. Algorithms")
    print("6. Data Structures")


def get_category_choice():
    try:
        choice = int(input("Enter choice: "))
        categoryUrl = choice_to_category[choice]
    except (ValueError, KeyError):
        print("Wrong choice entered. Exiting!")
        exit(1)
    return categoryUrl


def save_articles_as_html_and_pdf(categoryUrl):
    print("All links scraped, extracting articles")
    # Build the HTML skeleton for the combined article archive
    allArticles = ('<!DOCTYPE html>'
                   '<html><head>'
                   '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
                   '<link rel="stylesheet" href="style.min.css" type="text/css" media="all" />'
                   '<script src="https://cdn.rawgit.com/google/code-prettify/master/loader/run_prettify.js"></script>'
                   '</head><body>'
                   )
    allArticles += '<h1 style="text-align:center;font-size:40px">' + categoryUrl.title() + ' Archive</h1><hr>'
    allArticles += '<h1 style="padding-left:5%;font-size:200%;"> Index</h1><br/>'
    # Index entries, each linking to the matching article anchor below
    for x in range(len(articles)):
        allArticles += ('<a href="#' + str(x + 1) + '">'
                        '<h1 style="padding-left:5%;font-size:20px;">'
                        + str(x + 1) + ".\t\t" + articles[x].title + '</h1></a> <br/>')
    # Article bodies, each preceded by the anchor the index points at
    for x in range(len(articles)):
        allArticles += '<hr id="' + str(x + 1) + '">' + articles[x].content
    allArticles += '</body></html>'
    html_file_name = 'G4G_' + categoryUrl.title() + '.html'
    with open(html_file_name, 'w', encoding='utf-8') as html_file:
        html_file.write(allArticles)
    pdf_file_name = 'G4G_' + categoryUrl.title() + '.pdf'
    print("Generating PDF " + pdf_file_name)
    # Requires wkhtmltopdf to be installed and on the PATH
    system('wkhtmltopdf ' + html_file_name + ' ' + pdf_file_name)


def scrape_category(categoryUrl):
    try:
        soup = BeautifulSoup(requests.get(BASE_URL + categoryUrl, timeout=30).text,
                             'html.parser')
    except RequestException:
        print("Couldn't connect to the Internet! Please check your connection and try again.")
        exit(1)
    # Select the article links listed on the category page
    links = [a.attrs.get('href') for a in soup.select('article li a')]
    # Drop empty hrefs and same-page anchor links for the category headings
    links = [link for link in links if link and not link.startswith('#')]
    print("Found: " + str(len(links)) + " links")
    # Visit each link to find the article and save it
    for i, link in enumerate(links, start=1):
        try:
            if i % 10 == 0:
                sleep(5)  # Sleep for 5 seconds before scraping every 10th link
            link = link.strip()
            print("Scraping link no: " + str(i) + " Link: " + link)
            # A timeout keeps one stalled request from hanging the whole scrape
            link_soup = BeautifulSoup(requests.get(link, timeout=30).text,
                                      'html.parser')
            # Remove the space occupied by Google Ads (drop script & ins nodes)
            for tag in link_soup(["script", "ins"]):
                tag.extract()
            # Mark code blocks so run_prettify.js highlights them
            for code_tag in link_soup.find_all('pre'):
                code_tag['class'] = code_tag.get('class', []) + ['prettyprint']
            article = link_soup.find('article')
            if article is None:
                continue  # Page has no <article> element; nothing to save
            title = link_soup.title.string if link_soup.title else link
            # Add this article to the list of all articles
            articles.append(Article(title=title, content=str(article)))
        # Ctrl+C skips a link that is taking too long and tries the next one
        except KeyboardInterrupt:
            continue
        except RequestException:
            print("Internet disconnected! Please check your connection and try again.")
            if articles:
                print("Making a PDF of the links scraped so far.")
                break
            else:
                exit(1)


if __name__ == '__main__':
    display_menu()
    categoryUrl = get_category_choice()
    scrape_category(categoryUrl)
    save_articles_as_html_and_pdf(categoryUrl)
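
The Article class is imported from the repository's local article module, which is not shown on this page. Based only on how it is used above (constructed with title and content keyword arguments, read back via .title and .content), a minimal sketch of what that module likely contains:

class Article(object):
    # A minimal sketch inferred from usage in g4g.py; the real article.py
    # in the repository may differ.
    """Plain container for one scraped article: its title and HTML content."""
    def __init__(self, title, content):
        self.title = title
        self.content = content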