Skip to content

Commit ae71049

Browse files
authored
Merge pull request #162 from Harshitmishra001/main
Adding Updated_Scrapper.py
2 parents 4c83bc7 + 34220e8 commit ae71049

File tree

1 file changed

+116
-0
lines changed

1 file changed

+116
-0
lines changed

Web_app/Updated_Scrapper.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import time
2+
import random
3+
import pandas as pd
4+
from selenium import webdriver
5+
from selenium.webdriver.chrome.service import Service
6+
from selenium.webdriver.chrome.options import Options
7+
from selenium.webdriver.common.by import By
8+
from selenium.webdriver.support.ui import WebDriverWait
9+
from selenium.webdriver.support import expected_conditions as EC
10+
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException, StaleElementReferenceException
11+
from fake_useragent import UserAgent
12+
from bs4 import BeautifulSoup
13+
import re
14+
import os
15+
16+
# --- Scraper configuration constants ---
MAX_PAGES = 300          # upper bound on result pages to paginate through
LOAD_MORE_TIMEOUT = 10   # seconds to wait for the "load more" button to be clickable
SCRAPE_TIMEOUT = 10      # seconds to wait for the result list to render
# NOTE(review): this is evaluated ONCE at import time, so every call to
# time.sleep(WAIT_TIME) waits the exact same duration for the whole run.
# Presumably a fresh random delay per page was intended (anti-bot jitter) —
# confirm, and if so move the uniform() call to the sleep site.
WAIT_TIME = random.uniform(1, 3)
21+
22+
# Create a custom exception for scraping errors
23+
class ScrapingError(Exception):
    """Raised when a results page cannot be scraped."""
25+
26+
def load_more_results(driver):
    """Click the "see more" / next-page button on the current results page.

    Scrolls the button into view and clicks it via JavaScript (a plain
    ``.click()`` can fail when the element is overlapped by sticky headers),
    then pauses briefly so the new results can render.

    Args:
        driver: an active Selenium WebDriver on an IMDb search-results page.

    Returns:
        bool: True when the button was found and clicked (more results were
        loaded); False when it is absent/not clickable — the caller treats
        False as the signal to stop paginating.
    """
    try:
        # FIX: the original XPath was malformed —
        #   "//button[contains(@class, 'ipc-see-more__button')] or contains(@class, 'next-page')]"
        # (stray ']' and the 'or' outside the predicate), which raises
        # InvalidSelectorException instead of ever matching. Both class
        # alternatives now live inside one predicate.
        load_more_button = WebDriverWait(driver, LOAD_MORE_TIMEOUT).until(
            EC.element_to_be_clickable((
                By.XPATH,
                "//button[contains(@class, 'ipc-see-more__button') or contains(@class, 'next-page')]",
            ))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
        driver.execute_script("arguments[0].click();", load_more_button)
        time.sleep(WAIT_TIME)  # give the newly loaded results time to render
        return True
    except (NoSuchElementException, ElementNotInteractableException, TimeoutException) as e:
        print(f"Error loading more results: {e}")
        return False
38+
39+
def scrape_movie_data(driver):
    """Scrape the titles visible on the current IMDb search-results page.

    Waits for the result list to be present, then extracts title, year,
    rating and description from each ``<li>`` item. Any field that cannot
    be located defaults to the string "NA".

    Args:
        driver: an active Selenium WebDriver on an IMDb search-results page.

    Returns:
        list[dict]: one dict per result with keys
        'title', 'type', 'year', 'rating', 'description'.

    Raises:
        ScrapingError: when the result list itself cannot be located.
    """

    def field_text(element, xpath):
        # Text of the first xpath match under *element*, "NA" when missing.
        # FIX: the original used bare `except:` here, which also swallows
        # KeyboardInterrupt/SystemExit; catch only the Selenium lookup errors.
        try:
            return element.find_element(By.XPATH, xpath).text
        except (NoSuchElementException, StaleElementReferenceException):
            return "NA"

    all_movies = []
    try:
        # Wait for the movie list container to be present.
        # NOTE(review): this absolute XPath is extremely brittle — any IMDb
        # layout change breaks it; a class/attribute selector would be safer.
        WebDriverWait(driver, SCRAPE_TIMEOUT).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul"))
        )

        # Wait for the individual movie entries within the list.
        movie_elements = WebDriverWait(driver, SCRAPE_TIMEOUT).until(
            EC.presence_of_all_elements_located((By.XPATH, "//li[@class='ipc-metadata-list-summary-item']"))
        )

        for movie_element in movie_elements:
            org_title = field_text(movie_element, ".//h3[@class='ipc-title__text']")
            # Strip the leading "N. " ranking prefix IMDb puts on titles.
            title = re.sub(r'\d+\.\s*', '', org_title) if org_title != "NA" else "NA"

            year = field_text(movie_element, ".//span[@class='sc-b189961a-8 kLaxqf dli-title-metadata-item']")

            # Rating text looks like "8.5 (1.2M)"; keep only the number.
            # FIX: guard the split so an empty string can't raise IndexError
            # (previously masked by the bare except).
            rating_text = field_text(movie_element, ".//span[@class='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating']")
            rating_parts = rating_text.split()
            rating = rating_parts[0] if rating_parts else "NA"

            description = field_text(movie_element, ".//div[@class='ipc-html-content-inner-div']")

            all_movies.append({
                'title': title,
                # NOTE(review): hard-coded even though the search mixes
                # features, TV movies, episodes, etc. — confirm intent.
                'type': "Tv-Series",
                'year': year,
                'rating': rating,
                'description': description,
            })
    except (NoSuchElementException, ElementNotInteractableException, TimeoutException, StaleElementReferenceException) as e:
        print(f"Error scraping movie data: {e}")
        # Chain the cause so the original Selenium error is not lost.
        raise ScrapingError("Error during scraping.") from e
    return all_movies
82+
83+
def main():
    """Drive a headless Chrome session over IMDb search results and save the
    scraped rows to 'imdb_movies.csv'.

    Pages through results until either MAX_PAGES is reached or no "load
    more" button is found, then writes one CSV row per scraped title.
    """
    # FIX: raw string — "\c" in a normal literal is an invalid escape
    # sequence (DeprecationWarning today, a SyntaxError in future Pythons).
    # Replace with the actual path to your chromedriver executable.
    driver_path = r"C:\chromedriver-win64\chromedriver-win64\chromedriver.exe"
    user_data_dir = os.path.join(os.path.expanduser('~'), 'AppData/Local/Google/Chrome/User Data/Default')

    options = Options()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')
    # Reuse the local Chrome profile (cookies, language settings, etc.).
    options.add_argument(f'user-data-dir={user_data_dir}')

    service = Service(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)
    # FIX: quit the browser even when scraping raises, so a crash does not
    # leave an orphaned chromedriver/Chrome process behind.
    try:
        driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')

        all_movies_data = []
        cnt = 0
        while cnt < MAX_PAGES:
            cnt += 1
            print(f"Scraping page {cnt}")
            try:
                all_movies_data.extend(scrape_movie_data(driver))
            except ScrapingError:
                # Retry the same page; the loop is bounded by MAX_PAGES so
                # a persistently failing page cannot spin forever.
                print("Encountered a scraping error. Skipping page.")
                continue
            if not load_more_results(driver):
                break  # no more pages available

        pd.DataFrame(all_movies_data).to_csv('imdb_movies.csv', index=False)
    finally:
        driver.quit()
    print("Data scraped and saved to 'imdb_movies.csv'")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)