
Commit 3ba4ebd

committed
amazon
1 parent 28b32b7 commit 3ba4ebd

File tree

7 files changed

+9583
-0
lines changed


amazon_scrapping/Amazon-dataset/Product listing.csv

Lines changed: 3155 additions & 0 deletions

amazon_scrapping/README.md

Lines changed: 59 additions & 0 deletions
<h1 align="center">Amazon Scraping</h1>

<blockquote align="center">Scraping the product listing ✏️ using the Python programming language 💻.</blockquote>

<p align="center">To generate new data for the <b>classification part</b>, we wrote a Python script to fetch 📊 data from the Amazon website 🌐 and convert it into CSV files.</p>


# Introduction

The **`Semi-supervised-sequence-learning-Project`** :computer: replication is carried out here, and new data needs to be generated for further analysis.

- This directory includes the following:
  - `scrapping.py` - script to scrape the product data from the Amazon website
  - Product label listing: `Laptop`, `Phones`, `Printers`, `Desktops`, `Monitors`, `Mouse`, `Pendrive`, `Earphones`, `Smart TV`, `Power banks`


## Dependencies

- Install Selenium using `pip install -U selenium`

- Install Python 3 using the [MSI available on the python.org download page](http://www.python.org/download).

- Load the driver:

```python
from selenium import webdriver

browser = webdriver.Firefox()
browser.get('http://selenium.dev/')
```

- Selenium Server (optional):

```bash
java -jar selenium-server-standalone-4.0.0.jar
```

## Installation

**1️⃣ Fork the `Semi-supervised-sequence-learning-Project/` repository**
Follow these instructions on [how to fork a repository](https://help.github.com/en/articles/fork-a-repo).

**2️⃣ Clone the repository**
Once you have set up your fork of the `/Semi-supervised-sequence-learning-Project` repository, clone it to your local machine so you can make and test all of your personal edits before adding them to the master version of `/Semi-supervised-sequence-learning-Project`.

Navigate to the location on your computer where you want to host your code. Once in the appropriate folder, run the following command to clone the repository to your local machine.

```bash
git clone https://github.com/sanjay-kv/Semi-supervised-sequence-learning-Project.git
```

## Final Dataset

1️⃣ Here is the link to the **final dataset:** [Drive Link](https://drive.google.com/drive/folders/1HB8FCUVqkQpSbV7syq2ZsZ59kT5B-SGo?usp=sharing)
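Besides the raw listing, the `data_scrapped/` folder also carries `train.csv` and `test.csv` splits. A split like that can be reproduced with pandas — a minimal sketch, assuming an 80/20 ratio and toy column names (neither is taken from the repository):

```python
import pandas as pd

# Toy stand-in for the scraped product listing (hypothetical rows/columns)
listing = pd.DataFrame({
    "name": ["Laptop A", "Phone B", "Printer C", "Monitor D", "Mouse E"],
    "label": ["Laptop", "Phones", "Printers", "Monitors", "Mouse"],
})

train = listing.sample(frac=0.8, random_state=42)  # 80% of rows for training
test = listing.drop(train.index)                   # the remaining 20% for testing

train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)
print(len(train), len(test))  # → 4 1
```

`drop(train.index)` guarantees the two splits are disjoint, which is the property the downstream classification experiments rely on.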

amazon_scrapping/data_scrapped/Product listing.csv

Lines changed: 3157 additions & 0 deletions

amazon_scrapping/data_scrapped/test.csv

Lines changed: 395 additions & 0 deletions

amazon_scrapping/data_scrapped/train.csv

Lines changed: 2759 additions & 0 deletions

amazon_scrapping/scrapping.py

Lines changed: 58 additions & 0 deletions
# Scrape product names from Amazon search results
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

## One way to load the Chrome webdriver
# from webdriver_manager.chrome import ChromeDriverManager
# driver = webdriver.Chrome(ChromeDriverManager().install())

## Another way to load the Chrome webdriver (Selenium 4 style)
path = '/Users/mohammedrizwan/Downloads/chromedriver'
driver = webdriver.Chrome(service=Service(path))


def product_listing(txt):
    # Search Amazon for the given product keyword
    driver.get("https://www.amazon.in/")
    driver.implicitly_wait(2)
    driver.find_element(By.ID, 'twotabsearchtextbox').send_keys(txt)
    driver.implicitly_wait(2)
    driver.find_element(By.ID, 'nav-search-submit-button').click()
    driver.implicitly_wait(5)

    # Collect the product titles on the first results page
    items = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located(
        (By.XPATH, '//a[@class="a-link-normal a-text-normal"]')))
    for item in items:
        name_list.append(item.text)

    driver.implicitly_wait(5)
    # The pagination widget renders one entry per line; the last-but-one
    # line holds the number of result pages
    pagination_text = driver.find_element(By.CLASS_NAME, "a-pagination").text
    num_of_pg = pagination_text.splitlines()[-2]

    # Walk through the remaining result pages via the "Next" link
    for i in range(int(num_of_pg) - 5):
        print(i)
        items = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located(
            (By.XPATH, '//a[@class="a-link-normal a-text-normal"]')))
        for item in items:
            name_list.append(item.text)
        link = driver.find_element(By.CLASS_NAME, "a-section.a-spacing-none.a-padding-base")
        next_lin = link.find_element(By.CLASS_NAME, "a-last").find_element(By.TAG_NAME, "a").get_attribute("href")
        driver.get(next_lin)
        driver.implicitly_wait(2)


names = ['Laptop', 'Phones', 'Printers', 'Desktops', 'Monitors', 'Mouse',
         'Pendrive', 'Earphones', 'Smart TV', 'Power banks']
name_list = []
for i in names:
    product_listing(i)

df = pd.DataFrame(name_list)
df.to_csv('./prod_listings.csv')
print(df)
driver.quit()
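The script derives the page count from the text layout of Amazon's pagination widget. Isolated as a pure helper, that parsing step is easier to test — a sketch, where the sample string is hypothetical rather than a captured Amazon page:

```python
def page_count(pagination_text: str) -> int:
    """Number of result pages from the a-pagination widget text.

    The widget renders one entry per line and ends with the 'Next'
    control, so the last-but-one line is the final page number.
    """
    return int(pagination_text.splitlines()[-2])

# Hypothetical pagination text resembling Amazon's widget
sample = "1\n2\n3\n...\n20\nNext"
print(page_count(sample))  # → 20
```

Keeping the parsing separate from the WebDriver calls also means a markup change on Amazon's side only breaks one small, testable function.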
