
Commit f8ef534

stable version of Multiprocess approach of crawler
1 parent d699454 commit f8ef534

File tree

1 file changed (+152, -0)


crawler_2.0_stable.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
#!/usr/bin/python
#########################################USAGE###############################################
#$ Surface Crawler by Vishvendra Singh - multiprocess approach                              #
#$ crawler_2.0.py 8 http://example.com example.com                                          #
#$ crawler_2.0.py workers Full_URL mongo_collection_name                                    #
# Details - leftover links ("overload") are sent to worker 1; the rest of the load is       #
#           distributed equally among the other workers                                     #
#############################################################################################
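# NOTE: this is a Python 2 script (print statements, urlparse, Queue); it expects the
# requests, beautifulsoup4, lxml and pymongo packages to be installed.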
'''START IMPORT HERE'''
import multiprocessing
import requests
import urlparse
import sys
import Queue
import lxml
from bs4 import BeautifulSoup
import re
import pymongo
import time
'''END IMPORT HERE'''
'''Do Not Change Below code'''
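# Helper extractors: each one takes the parsed page (a BeautifulSoup tree, or the raw
# HTML for the regex-based helpers) and returns a single field of the per-page record.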
def getdescription(soup):
    meta_desc = ''
    desc = soup.findAll(attrs={"name": "description"})
    if len(desc):
        meta_desc = desc[0]['content'].encode('utf-8')
    return meta_desc

def getkeywords(soup):
    meta_keyword = ''
    meta_key = soup.findAll(attrs={"name": "keywords"})
    if len(meta_key):
        meta_keyword = meta_key[0]['content'].encode('utf-8')
    return meta_keyword

def gettitle(soup):
    title = ''
    for title_tag in soup.findAll('title'):
        title = title_tag.text
    return title

def getbodytext(soup):
    body = ''
    for elem in soup.findAll(['script', 'style']):
        elem.extract()                      # drop script/style noise before reading the text
    for body_tag in soup.findAll('body'):
        body = body_tag.text
    return " ".join(body.split())

def getphone(html):
    # match common phone number formats anywhere in the raw HTML
    str_list = re.findall(r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{5})", html)
    ph_list = filter(None, str_list)
    return ph_list

def getemail(html):
    email_match = re.findall(r'[\w\.-]+@[\w\.-]+', html)
    return email_match

def geturl(soup):
    # relies on the module-level url_callable to absolutise relative links
    urldict = {}
    for link in soup.findAll("a"):
        url_mended = urlparse.urljoin(url_callable, link.get("href"))
        url_crap = urlparse.urldefrag(url_mended)
        url_mended = url_crap[0]            # strip the #fragment part
        urldict[link.text] = url_mended
    return urldict
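
# storeData() writes one page record into the MongoDB collection; it relies on the global
# `db` handle set up in the (commented-out) connection block further down in the script.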
def storeData(data_dump):
    try:
        db[collection].insert(data_dump, check_keys=False)
    except ValueError:
        print "Oops! Dict 1 of the functions threw this error..."
'''Do Not Change Above code'''
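
# worker(): body of each spawned process. It pops its share of URLs off the shared queue,
# fetches and parses each page, and hands one record per page to storeData().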
def worker(i, no_of_links):
    for _ in range(no_of_links):            # don't shadow the worker index i
        print ""
        try:
            # block briefly instead of get_nowait(): items may still be flushing into the pipe
            fetched = q.get(timeout=10)
        except Queue.Empty:
            return
        print "--", fetched, "--By Thread-", i, "--"
        print "Fetching...", fetched
        r = requests.get(fetched)
        html = r.text
        soup = BeautifulSoup(html, "lxml")
        all_url_dict = geturl(soup)
        data_dump = {'url': str(fetched),
                     'page_title': gettitle(soup),
                     'unix_time': time.time(),
                     'meta_description': getdescription(soup),
                     'meta_keywords': getkeywords(soup),
                     'body': getbodytext(soup),
                     'phone': getphone(html),
                     'email': getemail(html),
                     'all_url': all_url_dict}
        try:
            storeData(data_dump)
        except ValueError:
            print "Oops! store function error"
        print ""
    return
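
# Main script: normalise the start URL from argv, seed the queue with the links found on
# that page, then fan the work out to the worker processes.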
'''START CRAWLABLE URL '''
url = sys.argv[2]
if url.endswith('/'):
    url = url[:-1]
if urlparse.urlparse(url).scheme == '':
    url = 'http://' + url
url_clean = urlparse.urlparse(url).netloc
url_callable = urlparse.urlparse(url).scheme + '://' + urlparse.urlparse(url).netloc
print url_callable
'''END CRAWLABLE URL '''
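# MongoDB connection: uncomment the two lines below and fill in real credentials before a
# run that should persist data; storeData() above expects the global `db` to exist.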
##mongodb authentication
#conn = pymongo.MongoClient('mongodb://username:password@hostname')
#db = conn.database
##mongodb authentication
try:
    collection = sys.argv[3]
except IndexError:
    collection = url_clean                  # no collection name given, fall back to the domain

no_of_threads = int(sys.argv[1])            # number of worker processes defined here
# A multiprocessing.Queue is shared with the forked workers; a plain Queue.Queue would be
# copied into each child process, so every worker would crawl the same URLs.
q = multiprocessing.Queue()

user_agent = {'User-agent': 'Mozilla/5.0'}
r = requests.get(url, headers=user_agent)
html = r.text
soup = BeautifulSoup(html, "lxml")

all_url_dict = geturl(soup)
all_url_list = list(all_url_dict.values())
total_links = len(all_url_list)

for item in all_url_list:
    q.put(item)

print "-Queue Size is - ", q.qsize()
no_of_links_to_one_thread = q.qsize() / no_of_threads      # integer share per worker
if no_of_links_to_one_thread < 1:
    no_of_links_to_one_thread = 1

# never start more workers than there are queued links
if q.qsize() < no_of_threads:
    no_of_thread = q.qsize()
else:
    no_of_thread = no_of_threads
print no_of_thread, "--no of threads"
total_left = q.qsize()

processes = []
for i in range(1, no_of_thread + 1):
    if i == 1 and q.qsize() > no_of_threads:
        extra = total_links % no_of_threads                # remainder goes to worker 1
        to_send = extra + no_of_links_to_one_thread
    else:
        to_send = no_of_links_to_one_thread
    print to_send, "to send to thread", i
    t = multiprocessing.Process(target=worker, args=(i, to_send))
    t.start()
    processes.append(t)

# wait for every worker, not just the last one started
for t in processes:
    t.join()
print ""
