gpsyrou · gpsyrou · Dec 2, 2021 · Dec 20, 2021
diff --git a/.DS_Store b/.DS_Store
diff --git a/__pycache__/trustplt.cpython-37.pyc b/__pycache__/trustplt.cpython-37.pyc
diff --git a/config.json b/config.json
@@ -1,6 +1,6 @@
 {
-    "starting_page": "/review/stuart.com?b=MTYyMTE4NDQ3NzAwMHw2MGExNGZkZGY5ZjQ4NzBiNzAwNTQwMTc",
+    "starting_page": "/review/www.deliveroo.co.uk",
     "steps": 4,
     "source_url": "https://uk.trustpilot.com",
-    "company": "Stuart"
-}
+    "company": "Deliveroo"
+}
diff --git a/helpers/__pycache__/utilities.cpython-37.pyc b/helpers/__pycache__/utilities.cpython-37.pyc
diff --git a/pilot_versions/.DS_Store b/pilot_versions/.DS_Store
diff --git a/pilot_versions/trustpilot_debbuger.py b/pilot_versions/trustpilot_debbuger.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import urllib.request
+import time
+import re
+from dateutil.parser import isoparse
+from datetime import datetime
+from typing import List, Mapping
+from bs4 import (BeautifulSoup,
+                 element)
+
+from helpers.utilities import retrieve_processed_pages, NoDataRetrievedError
+
+
+
+def retrieve_reviews(reviews_html: BeautifulSoup,
+                     rvw_section_att='typography_typography__QgicV typography_body__9UBeQ typography_color-black__5LYEn typography_weight-regular__TWEnf typography_fontstyle-normal__kHyN3') -> element.ResultSet:
+    """
+    Return every 'div' tag in reviews_html whose 'class' attribute matches
+    rvw_section_att, as a bs4 ResultSet (one entry per matching tag).
+    """
+    class_filter = {'class': rvw_section_att}
+    return reviews_html.find_all('div', attrs=class_filter)
+
+def reviews_page_to_html(target_url: str) -> BeautifulSoup:
+    """
+    Given a website link (URL), retrieve the corresponding web page and
+    parse it into a BeautifulSoup (HTML) object.
+
+    Parameters
+    ----------
+    target_url : str
+        URL of the web page that will be parsed into an HTML object.
+
+    Raises an Exception when the server answers with a non-200 status.
+    """
+    # The context manager closes the HTTP response even if parsing fails.
+    with urllib.request.urlopen(target_url) as response:
+        if response.getcode() != 200:
+            raise Exception('Can not communicate with the client')
+        # Parse inside the block, while the response can still be read.
+        return BeautifulSoup(response.read(), 'html.parser')
+
+page = 'https://uk.trustpilot.com/review/www.deliveroo.co.uk'
+
+page_html = reviews_page_to_html(page)
+retrieve_reviews(page_html)
+
+# Drill down: main content -> reviews container -> (presumably) review cards.
+# Each [0] assumes the class was found; an IndexError means the markup changed.
+nav = page_html.find_all('div', attrs={'class': 'styles_mainContent__nFxAv'})
+
+nav = nav[0].find_all('section', attrs={'class': 'styles_reviewsContainer__3_GQw'})
+
+nav[0].find_all('div', attrs={'class': "paper_paper__1PY90 paper_square__lJX8a card_card__lQWDv card_noPadding__D8PcU styles_cardWrapper__772_o styles_show__FYIO3 styles_reviewCard__9HxJJ"})
diff --git a/trustplt.py b/trustplt.py
@@ -1,6 +1,6 @@
 
 import pandas as pd
-import urllib
+import urllib.request
 import time
 import re
 from dateutil.parser import isoparse
@@ -197,12 +197,12 @@ def trustplt_sniffer(base_domain: str,
     pages_ls = []
     landing_page = base_domain + starting_page
     processed_pages = retrieve_processed_pages(processed_urls_f)
-
     with open(processed_urls_f, 'a') as file:
         while steps != 0:
             reviews_page_html = reviews_page_to_html(landing_page)
             try:
                 page = retrieve_next_page(reviews_page_html)
+                print('processing: {0}'.format(page))
                 reviews = retrieve_reviews(reviews_page_html)
                 df = reviews_page_to_df(reviews,
                                             ratings_dict=ratings_dict,
@@ -216,7 +216,8 @@ def trustplt_sniffer(base_domain: str,
                 landing_page = base_domain + page
                 steps -= 1
                 time.sleep(1)
-            except IndexError:
+            except IndexError as e:
+                print(e)
                 pass
     file.close()