diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..f1f5069 Binary files /dev/null and b/.DS_Store differ diff --git a/__pycache__/trustplt.cpython-37.pyc b/__pycache__/trustplt.cpython-37.pyc new file mode 100644 index 0000000..40c4a56 Binary files /dev/null and b/__pycache__/trustplt.cpython-37.pyc differ diff --git a/config.json b/config.json index 89e4ff5..881e8f9 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "starting_page": "/review/stuart.com?b=MTYyMTE4NDQ3NzAwMHw2MGExNGZkZGY5ZjQ4NzBiNzAwNTQwMTc", + "starting_page": "/review/www.deliveroo.co.uk", "steps": 4, "source_url": "https://uk.trustpilot.com", - "company": "Stuart" -} \ No newline at end of file + "company": "Deliveroo" +} diff --git a/helpers/__pycache__/utilities.cpython-37.pyc b/helpers/__pycache__/utilities.cpython-37.pyc index bca8bdc..31b108d 100644 Binary files a/helpers/__pycache__/utilities.cpython-37.pyc and b/helpers/__pycache__/utilities.cpython-37.pyc differ diff --git a/pilot_versions/.DS_Store b/pilot_versions/.DS_Store new file mode 100644 index 0000000..60c94ef Binary files /dev/null and b/pilot_versions/.DS_Store differ diff --git a/pilot_versions/trustpilot_debbuger.py b/pilot_versions/trustpilot_debbuger.py new file mode 100755 index 0000000..3121e8d --- /dev/null +++ b/pilot_versions/trustpilot_debbuger.py @@ -0,0 +1,54 @@ +import pandas as pd +import urllib.request +import time +import re +from dateutil.parser import isoparse +from datetime import datetime +from typing import List, Mapping +from bs4 import (BeautifulSoup, + element) + +from helpers.utilities import retrieve_processed_pages, NoDataRetrievedError + + + +def retrieve_reviews(reviews_html: BeautifulSoup, + rvw_section_att='typography_typography__QgicV typography_body__9UBeQ typography_color-black__5LYEn typography_weight-regular__TWEnf typography_fontstyle-normal__kHyN3') -> element.ResultSet: + """ + The function returns an element.ResultSet, where each element is a tag + that contain all the information of the reviews. The ResultSet has a length + of 20. A 'review-card' element corresponds to a separate review. + """ + return reviews_html.find_all('div', attrs={'class': rvw_section_att}) + +def reviews_page_to_html(target_url: str) -> BeautifulSoup: + """ + Given a website link (URL), retrieve the corresponding website in an html + format. + + Parameters + ---------- + target_url : str + URL of the webpage that will be transformed to a HTML object. + """ + #print('Attempting to retrieve HTML object for {0}'.format(target_url)) + request = urllib.request.urlopen(target_url) + if request.getcode() != 200: + raise Exception('Can not communicate with the client') + else: + response = request.read() + response_html = BeautifulSoup(response, 'html.parser') + return response_html + +page = 'https://uk.trustpilot.com/review/www.deliveroo.co.uk' + +page_html = reviews_page_to_html(page) +retrieve_reviews(page_html) + + + +nav = page_html.find_all('div', attrs={'class': 'styles_mainContent__nFxAv'}) + +nav = nav[0].find_all('section', attrs={'class': 'styles_reviewsContainer__3_GQw'}) + +nav[0].fina_all('div', attrs={'class': "paper_paper__1PY90 paper_square__lJX8a card_card__lQWDv card_noPadding__D8PcU styles_cardWrapper__772_o styles_show__FYIO3 styles_reviewCard__9HxJJ"}) \ No newline at end of file diff --git a/trustplt.py b/trustplt.py index e2562f6..a8423bd 100644 --- a/trustplt.py +++ b/trustplt.py @@ -1,6 +1,6 @@ import pandas as pd -import urllib +import urllib.request import time import re from dateutil.parser import isoparse @@ -197,12 +197,12 @@ def trustplt_sniffer(base_domain: str, pages_ls = [] landing_page = base_domain + starting_page processed_pages = retrieve_processed_pages(processed_urls_f) - with open(processed_urls_f, 'a') as file: while steps != 0: reviews_page_html = reviews_page_to_html(landing_page) try: page = retrieve_next_page(reviews_page_html) + print('processing: {0}'.format(page)) reviews = retrieve_reviews(reviews_page_html) df = reviews_page_to_df(reviews, ratings_dict=ratings_dict, @@ -216,7 +216,8 @@ def trustplt_sniffer(base_domain: str, landing_page = base_domain + page steps -= 1 time.sleep(1) - except IndexError: + except IndexError as e: + print(e) pass file.close()