22from bs4 import BeautifulSoup
33
44from pysmartprice .results import SmartPriceResult
5- from pysmartprice .helpers import scrape
5+ from pysmartprice .helpers import scrape , scrape_helper
66from pysmartprice import constants
77
88
9- class PriceListParser (object ):
9+ class BaseParser (object ):
1010 def __init__ (self , mapper , ** kwargs ):
1111 self .mapper = mapper
12+ self .params = kwargs
1213 self .url = constants .URL_MAPPER [self .mapper ]
13- self .response = scrape (self ._make_url (self .url ))
14+ self .response = scrape (self ._make_url (self .url ), ** kwargs )
1415 self .soup = BeautifulSoup (self .response , 'lxml' )
1516 self .result = [
1617 SmartPriceResult (self .get_product_attrs (item ))
@@ -21,17 +22,14 @@ def _make_url(self, target):
2122 return '{}{}' .format (constants .SMARTPRICE_WEB_URL , target )
2223
2324 @property
24- def get_page_range (self ):
25- page_range = self .soup . findAll (
26- 'span' , attrs = { 'class' : 'pgntn__rslt-page' } )
25+ def price_results (self ):
26+ if self .get_page_range :
27+ return self . process_multiple_pages ( )
2728
28- if not page_range :
29- return None
29+ return self .result
3030
31- first_page = int (page_range [0 ].text )
32- last_page = int (page_range [1 ].text )
33- return first_page , last_page
3431
32+ class ParserMixin (object ):
3533 def get_product_attrs (self , item ):
3634 return dict (
3735 img = item .find ('img' ).get ('src' ),
@@ -48,12 +46,6 @@ def products_html(self):
4846 html = self .soup .findAll ('div' , attrs = {'class' : 'prdct-item' })
4947 return html
5048
51- @property
52- def get_paged_url (self ):
53- i = self .url .find (self .mapper )
54- paged_url = '{}pages/{}' .format (self .url [:i ], self .url [i :])
55- return paged_url
56-
5749 def process_multiple_pages (self ):
5850 results = self .result
5951 first_page , last_page = self .get_page_range
@@ -62,12 +54,15 @@ def process_multiple_pages(self):
6254
6355 for page in range (first_page + 1 , last_page + 1 ):
6456 url = paged_url .replace ('.html' , '-{}.html' .format (page ))
65- page_urls .append (self ._make_url (url ))
57+ params = self .params .copy ()
58+ if self .params .get ('page' , None ):
59+ params .update ({'page' : page })
60+ page_urls .append ((self ._make_url (url ), params ))
6661
6762 # Scrape pages in parallel
6863 pool = multiprocessing .Pool (processes = multiprocessing .cpu_count ()* 2 )
6964
70- for page in pool .map (scrape , page_urls ):
65+ for page in pool .map (scrape_helper , page_urls ):
7166 self .soup = BeautifulSoup (page , 'lxml' )
7267
7368 results += [
@@ -77,8 +72,13 @@ def process_multiple_pages(self):
7772 return results
7873
7974 @property
80- def price_results (self ):
81- if self .get_page_range :
82- return self . process_multiple_pages ( )
75+ def get_page_range (self ):
76+ page_range = self .soup . findAll (
77+ 'span' , attrs = { 'class' : 'pgntn__rslt-page' } )
8378
84- return self .result
79+ if not page_range :
80+ return None
81+
82+ first_page = int (page_range [0 ].text )
83+ last_page = int (page_range [1 ].text )
84+ return first_page , last_page
0 commit comments