Commit 34220e8

Update Updated_Scrapper.py
1 parent 6b84edf commit 34220e8

File tree

1 file changed: +67 -39 lines changed

Web_app/Updated_Scrapper.py

Lines changed: 67 additions & 39 deletions
@@ -2,64 +2,82 @@
 import random
 import pandas as pd
 from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException, StaleElementReferenceException
 from fake_useragent import UserAgent
 from bs4 import BeautifulSoup
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from selenium import webdriver
+import re
 import os
 
+# Define constants for readability
+MAX_PAGES = 300
+LOAD_MORE_TIMEOUT = 10
+SCRAPE_TIMEOUT = 10
+WAIT_TIME = random.uniform(1, 3)
+
+# Create a custom exception for scraping errors
+class ScrapingError(Exception):
+    pass
+
 def load_more_results(driver):
     try:
-        load_more_button = WebDriverWait(driver, 10).until(
+        load_more_button = WebDriverWait(driver, LOAD_MORE_TIMEOUT).until(
             EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'ipc-see-more__button') or contains(@class, 'next-page')]"))
         )
         driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
         driver.execute_script("arguments[0].click();", load_more_button)
-        time.sleep(random.uniform(1, 3))
+        time.sleep(WAIT_TIME)
         return True
     except (NoSuchElementException, ElementNotInteractableException, TimeoutException) as e:
         print(f"Error loading more results: {e}")
         return False
 
 def scrape_movie_data(driver):
     all_movies = []
-    movie_elements = driver.find_element(By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul")
-    html_content = movie_elements.get_attribute('outerHTML')
-    soup = BeautifulSoup(html_content, 'html.parser')
-    lst = soup.find_all("li", class_="ipc-metadata-list-summary-item")
-    for i in lst:
-        try:
-            org_title = i.find("h3", class_="ipc-title__text").text
-            title = re.sub(r'\d+\.\s*', '', org_title)
-        except:
-            title = "NA"
-        try:
-            year = i.find("span", class_="sc-b189961a-8 kLaxqf dli-title-metadata-item").text
-        except:
-            year = "NA"
-        try:
-            rating = i.find("span", class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating').text.split()[0]
-        except:
-            rating = "NA"
-        try:
-            description = i.find("div", class_='ipc-html-content-inner-div').text
-        except:
-            description = "NA"
+    try:
+        # Wait for the movie list to be present
+        movie_list_container = WebDriverWait(driver, SCRAPE_TIMEOUT).until(
+            EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul"))
+        )
+
+        # Wait for the individual movie elements within the list to be present
+        movie_elements = WebDriverWait(driver, SCRAPE_TIMEOUT).until(
+            EC.presence_of_all_elements_located((By.XPATH, "//li[@class='ipc-metadata-list-summary-item']"))
+        )
+
+        for movie_element in movie_elements:
+            try:
+                org_title = movie_element.find_element(By.XPATH, ".//h3[@class='ipc-title__text']").text
+                title = re.sub(r'\d+\.\s*', '', org_title)
+            except:
+                title = "NA"
+            try:
+                year = movie_element.find_element(By.XPATH, ".//span[@class='sc-b189961a-8 kLaxqf dli-title-metadata-item']").text
+            except:
+                year = "NA"
+            try:
+                rating = movie_element.find_element(By.XPATH, ".//span[@class='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating']").text.split()[0]
+            except:
+                rating = "NA"
+            try:
+                description = movie_element.find_element(By.XPATH, ".//div[@class='ipc-html-content-inner-div']").text
+            except:
+                description = "NA"
 
-        all_movies.append({
-            'title': title,
-            'type': "Tv-Series",
-            'year': year,
-            'rating': rating,
-            'description': description
-        })
+            all_movies.append({
+                'title': title,
+                'type': "Tv-Series",
+                'year': year,
+                'rating': rating,
+                'description': description
+            })
+    except (NoSuchElementException, ElementNotInteractableException, TimeoutException, StaleElementReferenceException) as e:
+        print(f"Error scraping movie data: {e}")
+        raise ScrapingError("Error during scraping.")
     return all_movies
 
 def main():
@@ -74,15 +92,25 @@ def main():
     service = Service(executable_path=driver_path)
     driver = webdriver.Chrome(service=service, options=options)
     driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
+
     all_movies_data = []
-    while True:
-        movies_data = scrape_movie_data(driver)
-        all_movies_data.extend(movies_data)
+    cnt = 0
+    while cnt < MAX_PAGES:
+        cnt += 1
+        print(f"Scraping page {cnt}")
+        try:
+            movies_data = scrape_movie_data(driver)
+            all_movies_data.extend(movies_data)
+        except ScrapingError:
+            print("Encountered a scraping error. Skipping page.")
+            continue
         if not load_more_results(driver):
             break
+
     df = pd.DataFrame(all_movies_data)
     df.to_csv('imdb_movies.csv', index=False)
     driver.quit()
     print("Data scraped and saved to 'imdb_movies.csv'")
+
 if __name__ == "__main__":
     main()
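
The pagination pattern this commit introduces in main() (a page counter capped at MAX_PAGES, a ScrapingError that lets a bad page be skipped, and a loader that returns False when no more results exist) can be exercised in isolation. A minimal sketch, with fetch_page and has_next_page as hypothetical stand-ins for scrape_movie_data(driver) and load_more_results(driver):

MAX_PAGES = 300

class ScrapingError(Exception):
    pass

def fetch_page(page):
    # Hypothetical stand-in for scrape_movie_data(driver).
    if page == 2:
        raise ScrapingError("simulated parse failure")
    return [{"title": f"Movie {page}", "rating": "8.0"}]

def has_next_page(page):
    # Hypothetical stand-in for load_more_results(driver); pretend
    # the "load more" button disappears after page 4.
    return page < 4

def run():
    all_rows = []
    cnt = 0
    while cnt < MAX_PAGES:
        cnt += 1
        try:
            all_rows.extend(fetch_page(cnt))
        except ScrapingError as exc:
            # Mirrors the commit: log, skip this page, keep the loop bounded.
            print(f"Encountered a scraping error on page {cnt}: {exc}")
            continue
        if not has_next_page(cnt):
            break
    return all_rows

if __name__ == "__main__":
    print(run())

Bounding the loop with MAX_PAGES guards against the "load more" button reappearing indefinitely, which the previous while True: loop did not.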
