#!/usr/bin/python
#########################################USAGE###############################################
#$Surface Crawler By Vishvendra Singh - Multiprocess approach                               #
#$crawler_2.0.py 8 http://example.com example.com                                           #
#$crawler_2.0.py workers Full_URL mongo_collection_name                                     #
#Details - Any leftover links (the remainder) are sent to worker 1; the rest of the load   #
#is split equally among the other workers.                                                  #
#############################################################################################
'''START IMPORT HERE'''
import multiprocessing
import requests
import urlparse
import sys
import Queue
import lxml  # imported so the 'lxml' parser used by BeautifulSoup below is guaranteed to be present
from bs4 import BeautifulSoup
import re
import pymongo
import time
'''END IMPORT HERE'''
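# NOTE: this is a Python 2 script (print statements, urlparse, Queue); it expects the
# requests, beautifulsoup4, lxml and pymongo packages to be installed.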
'''Do Not Change Below code'''
def getdescription(soup):
    # Content of the <meta name="description"> tag, if present.
    meta_desc = ''
    desc = soup.findAll(attrs={"name": "description"})
    if len(desc):
        meta_desc = desc[0]['content'].encode('utf-8')
    return meta_desc

def getkeywords(soup):
    # Content of the <meta name="keywords"> tag, if present.
    meta_keyword = ''
    meta_key = soup.findAll(attrs={"name": "keywords"})
    if len(meta_key):
        meta_keyword = meta_key[0]['content'].encode('utf-8')
    return meta_keyword

def gettitle(soup):
    # Text of the page's <title> tag (the last one, if several are present).
    title = ''
    for title in soup.findAll('title'):
        title = title.text
    return title

def getbodytext(soup):
    # Drop <script>/<style> elements, then return the whitespace-normalised <body> text.
    body = ''
    for elem in soup.findAll(['script', 'style']):
        elem.extract()
    for body in soup.findAll('body'):
        body = body.text
    return " ".join(body.split())

def getphone(html):
    # Match common phone number formats such as 123-456-7890 or (123) 456 7890.
    ph_list = re.findall(r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{5})", html)
    return filter(None, ph_list)

def getemail(html):
    # Match anything that looks like an email address.
    email_match = re.findall(r'[\w\.-]+@[\w\.-]+', html)
    return email_match

def geturl(soup):
    # Collect every <a href> on the page, resolved against the site root and with fragments removed.
    urldict = {}
    for link in soup.findAll("a"):
        url_mended = urlparse.urljoin(url_callable, link.get("href"))
        url_mended = urlparse.urldefrag(url_mended)[0]
        urldict[link.text] = url_mended
    return urldict

def storeData(data_dump):
    # Insert one crawled page into MongoDB; a global `db` connection must be configured below.
    try:
        db[collection].insert(data_dump, check_keys=False)
    except ValueError:
        print "Oops! Dict from one of the functions threw this error..."
'''Do Not Change Above code'''

def worker(i, no_of_links):
    # Each worker process pulls up to `no_of_links` URLs from the shared queue,
    # extracts the page data and stores one document per page.
    for link_no in range(1, no_of_links + 1):
        try:
            fetched = q.get(timeout=10)  # give the queue time to deliver a link; stop when none are left
        except Queue.Empty:
            break
        print "--", fetched, "--By Thread-", i, "--"
        print "Fetching...", fetched
        r = requests.get(fetched)
        html = r.text
        soup = BeautifulSoup(html, "lxml")
        all_url_dict = geturl(soup)
        data_dump = {'url': str(fetched), 'page_title': gettitle(soup), 'unix_time': time.time(),
                     'meta_description': getdescription(soup), 'meta_keywords': getkeywords(soup),
                     'body': getbodytext(soup), 'phone': getphone(html), 'email': getemail(html),
                     'all_url': all_url_dict}
        try:
            storeData(data_dump)
        except ValueError:
            print "Oops! store function error"
    return

'''START CRAWLABLE URL '''
url = list(sys.argv)[2]
if url.endswith('/'):
    url = url[:-1]
if urlparse.urlparse(url).scheme == '':
    url = 'http://' + url
url_clean = urlparse.urlparse(url).netloc
url_callable = urlparse.urlparse(url).scheme + '://' + urlparse.urlparse(url).netloc
print url_callable
'''END CRAWLABLE URL '''
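# url_callable (scheme + host) is what relative links are resolved against in geturl();
# url_clean (the bare host) is the fallback MongoDB collection name when argv[3] is missing.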
##mongodb authentication
#conn = pymongo.MongoClient('mongodb://username:password@hostname')
#db=conn.database
##mongodb authentication
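# NOTE: storeData() writes through a global `db`; uncomment the two lines above and fill in
# real connection details (hostname, credentials, database) before running the crawler.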
try:
    collection = list(sys.argv)[3]
except IndexError:
    collection = url_clean

no_of_threads = int(list(sys.argv)[1])  # number of worker processes, taken from argv[1]
q = multiprocessing.Queue()             # shared by all worker processes so each link is crawled only once
                                        # (note: qsize() is approximate and unsupported on some platforms)

user_agent = {'User-agent': 'Mozilla/5.0'}
r = requests.get(url, headers=user_agent)
html = r.text
soup = BeautifulSoup(html, "lxml")

# Seed the queue with every link found on the start page.
all_url_dict = geturl(soup)
all_url_list = list(all_url_dict.values())
total_links = len(all_url_list)

for item in all_url_list:
    q.put(item)

print "-Queue Size is - ", q.qsize()

no_of_links_to_one_thread = q.qsize() / no_of_threads
if no_of_links_to_one_thread < 1:
    no_of_links_to_one_thread = 1

# Never spawn more workers than there are links to crawl.
if q.qsize() < no_of_threads:
    no_of_thread = q.qsize()
else:
    no_of_thread = no_of_threads
print no_of_thread, "--no of threads"
total_left = q.qsize()

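# Work split: every worker gets an equal share of the queued links; any remainder
# (total_links % no_of_threads) is added on top of worker 1's share, as noted in the header.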
processes = []
for i in range(1, no_of_thread + 1):
    if i == 1 and q.qsize() > no_of_threads:
        extra = total_links % no_of_threads
        to_send = extra + no_of_links_to_one_thread
    else:
        to_send = no_of_links_to_one_thread
    print to_send, "to send to thread", i
    t = multiprocessing.Process(target=worker, args=(i, to_send))
    t.start()
    processes.append(t)

# Wait for every worker process to finish.
for t in processes:
    t.join()