Skip to content

Commit 18f26c5

Browse files
authored
Merge pull request #181 from Harshitmishra001/main
Update scrapping.py
2 parents 947c303 + 8f273fd commit 18f26c5

File tree

1 file changed

+30
-43
lines changed

1 file changed

+30
-43
lines changed

amazon_scrapping/scrapping.py

Lines changed: 30 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,46 @@
1-
# product name
21
from selenium import webdriver
32
from selenium.webdriver.common.by import By
43
from selenium.webdriver.support.ui import WebDriverWait
54
from selenium.webdriver.support import expected_conditions as EC
6-
import time
7-
import json
8-
import csv
95
import pandas as pd
106

11-
## One way to load chrome webdirver
12-
#from webdriver_manager.chrome import ChromeDriverManager
13-
#driver = webdriver.Chrome(ChromeDriverManager().install())
14-
15-
## another way to load chrome webdriver
16-
import os

# Path to the ChromeDriver binary. A hard-coded per-developer path breaks on
# every other machine, so allow an override via CHROMEDRIVER_PATH while
# keeping the original location as the default for backward compatibility.
path = os.environ.get('CHROMEDRIVER_PATH', '/Users/hmharsh/Downloads/chromedriver')

# NOTE(review): passing the path positionally is deprecated in Selenium 4
# (use Service(path)) — confirm the pinned selenium version before changing.
driver = webdriver.Chrome(path)
189

1910
def product_listing(txt):
    """Search Amazon.in for *txt* and return the product titles found.

    Types the query into the search box, submits it, then walks every
    result page by clicking the "next" pagination button until it is
    disabled or missing.

    Args:
        txt: Search query typed into the Amazon search box.

    Returns:
        list[str]: Product names collected across all result pages.
    """
    name_list = []
    driver.get("https://www.amazon.in/")
    # Explicit wait for the search box — more reliable than implicit waits.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'twotabsearchtextbox')))
    search_box = driver.find_element(By.ID, 'twotabsearchtextbox')
    search_box.clear()
    search_box.send_keys(txt)
    search_button = driver.find_element(By.ID, 'nav-search-submit-button')
    search_button.click()

    while True:
        # Product titles live in this span class on the results page.
        title_xpath = '//span[@class="a-size-medium a-color-base a-text-normal"]'
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, title_xpath)))
        items = driver.find_elements(By.XPATH, title_xpath)
        name_list.extend(item.text for item in items)

        # Stop when the "next" button is disabled or absent (last page).
        # Fix: a bare `except:` also swallowed KeyboardInterrupt/SystemExit;
        # catching Exception keeps the best-effort pagination behavior while
        # letting interpreter-exit signals propagate.
        try:
            next_button = driver.find_element(By.CLASS_NAME, 's-pagination-next')
            if 's-pagination-disabled' in next_button.get_attribute('class'):
                break
            next_button.click()
        except Exception:
            break
    return name_list
5034

5135
names = ['Laptop', 'Phones', 'Printers', 'Desktops', 'Monitors', 'Mouse',
         'Pendrive', 'Earphones', 'Smart TV', 'Power banks']

all_product_listings = []
try:
    # Collect listings for every category into one flat list.
    for name in names:
        all_product_listings.extend(product_listing(name))
finally:
    # Fix: quit() previously ran only on the success path, leaking the
    # browser process if any category's scrape raised. Always release it.
    driver.quit()

# Convert the collected names to a single-column DataFrame and save as CSV.
df = pd.DataFrame(all_product_listings, columns=['Product Name'])
df.to_csv('./prod_listings.csv', index=False)
print(df)

0 commit comments

Comments
 (0)