Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
Binary file added __pycache__/trustplt.cpython-37.pyc
Binary file not shown.
6 changes: 3 additions & 3 deletions config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"starting_page": "/review/stuart.com?b=MTYyMTE4NDQ3NzAwMHw2MGExNGZkZGY5ZjQ4NzBiNzAwNTQwMTc",
"starting_page": "/review/www.deliveroo.co.uk",
"steps": 4,
"source_url": "https://uk.trustpilot.com",
"company": "Stuart"
}
"company": "Deliveroo"
}
Binary file modified helpers/__pycache__/utilities.cpython-37.pyc
Binary file not shown.
Binary file added pilot_versions/.DS_Store
Binary file not shown.
54 changes: 54 additions & 0 deletions pilot_versions/trustpilot_debbuger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import pandas as pd
import urllib.request
import time
import re
from dateutil.parser import isoparse
from datetime import datetime
from typing import List, Mapping
from bs4 import (BeautifulSoup,
element)

from helpers.utilities import retrieve_processed_pages, NoDataRetrievedError



def retrieve_reviews(reviews_html: BeautifulSoup,
                     rvw_section_att='typography_typography__QgicV typography_body__9UBeQ typography_color-black__5LYEn typography_weight-regular__TWEnf typography_fontstyle-normal__kHyN3') -> element.ResultSet:
    """
    Collect every <div> of a parsed page whose ``class`` attribute matches
    ``rvw_section_att``.

    Parameters
    ----------
    reviews_html : BeautifulSoup
        Parsed HTML document to search.
    rvw_section_att : str
        Value used as the ``class`` filter in the ``find_all`` lookup.

    Returns
    -------
    element.ResultSet
        Whatever ``find_all`` returns for that filter.

    Notes
    -----
    NOTE(review): presumably each matching tag holds one review; how many
    matches a page yields cannot be confirmed from this file.
    """
    class_filter = {'class': rvw_section_att}
    matching_divs = reviews_html.find_all('div', attrs=class_filter)
    return matching_divs

def reviews_page_to_html(target_url: str) -> BeautifulSoup:
    """
    Download the page at ``target_url`` and parse it into a BeautifulSoup
    object.

    Parameters
    ----------
    target_url : str
        URL of the webpage that will be transformed to a HTML object.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'html.parser' backend.

    Raises
    ------
    Exception
        If the response status code is not 200. Errors raised by
        ``urllib.request.urlopen`` itself are not caught here and propagate
        to the caller.
    """
    # The context manager closes the HTTP response on every path; it was
    # previously left open, including when the status check raised.
    with urllib.request.urlopen(target_url) as response:
        if response.getcode() != 200:
            raise Exception('Can not communicate with the client')
        payload = response.read()
    return BeautifulSoup(payload, 'html.parser')

# Ad-hoc debugging run: fetch one reviews page and probe its structure.
page = 'https://uk.trustpilot.com/review/www.deliveroo.co.uk'

page_html = reviews_page_to_html(page)
retrieve_reviews(page_html)

# Drill down step by step: the 'styles_mainContent' <div>, then the
# 'styles_reviewsContainer' <section> inside it, then the
# 'styles_reviewCard' <div>s inside that.
nav = page_html.find_all('div', attrs={'class': 'styles_mainContent__nFxAv'})

nav = nav[0].find_all('section', attrs={'class': 'styles_reviewsContainer__3_GQw'})

# Fixed typo: this call was spelled `fina_all`, which is not a search method
# and made this line fail instead of returning the matching tags.
nav[0].find_all('div', attrs={'class': "paper_paper__1PY90 paper_square__lJX8a card_card__lQWDv card_noPadding__D8PcU styles_cardWrapper__772_o styles_show__FYIO3 styles_reviewCard__9HxJJ"})
7 changes: 4 additions & 3 deletions trustplt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

import pandas as pd
import urllib
import urllib.request
import time
import re
from dateutil.parser import isoparse
Expand Down Expand Up @@ -197,12 +197,12 @@ def trustplt_sniffer(base_domain: str,
pages_ls = []
landing_page = base_domain + starting_page
processed_pages = retrieve_processed_pages(processed_urls_f)

with open(processed_urls_f, 'a') as file:
while steps != 0:
reviews_page_html = reviews_page_to_html(landing_page)
try:
page = retrieve_next_page(reviews_page_html)
print('processing: {0}'.format(page))
reviews = retrieve_reviews(reviews_page_html)
df = reviews_page_to_df(reviews,
ratings_dict=ratings_dict,
Expand All @@ -216,7 +216,8 @@ def trustplt_sniffer(base_domain: str,
landing_page = base_domain + page
steps -= 1
time.sleep(1)
except IndexError:
except IndexError as e:
print(e)
pass
file.close()

Expand Down