1
+ import time
2
+ import random
3
+ import pandas as pd
4
+ from selenium import webdriver
5
+ from selenium .webdriver .chrome .service import Service
6
+ from selenium .webdriver .chrome .options import Options
7
+ from selenium .webdriver .common .by import By
8
+ from selenium .webdriver .support .ui import WebDriverWait
9
+ from selenium .webdriver .support import expected_conditions as EC
10
+ from selenium .common .exceptions import TimeoutException , NoSuchElementException , ElementNotInteractableException , StaleElementReferenceException
11
+ from fake_useragent import UserAgent
12
+ from bs4 import BeautifulSoup
13
+ import re
14
+ import os
15
+
16
# Tuning knobs for the scraper.
MAX_PAGES = 300          # hard cap on result pages to walk before stopping
LOAD_MORE_TIMEOUT = 10   # seconds to wait for the "see more" button to be clickable
SCRAPE_TIMEOUT = 10      # seconds to wait for the result list to appear
# NOTE(review): evaluated once at import time, so the same duration is reused
# for every sleep in the run — confirm whether per-call jitter was intended.
WAIT_TIME = random.uniform(1, 3)
+
22
class ScrapingError(Exception):
    """Raised when the movie data on a results page cannot be scraped."""
25
+
26
def load_more_results(driver):
    """Click IMDb's "see more" / next-page button on the current results page.

    Returns True when the button was found and clicked (more results will
    load), False when no clickable button appeared within LOAD_MORE_TIMEOUT.
    """
    try:
        # BUGFIX: the original XPath had a stray ']' after the first
        # contains() — "...'ipc-see-more__button')] or contains(...)]" — which
        # is not a valid XPath expression. The 'or' belongs inside one predicate.
        load_more_button = WebDriverWait(driver, LOAD_MORE_TIMEOUT).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'ipc-see-more__button') or contains(@class, 'next-page')]"))
        )
        # Scroll into view and click via JS to avoid overlay/interception issues.
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
        driver.execute_script("arguments[0].click();", load_more_button)
        # Fresh jitter per click; the module-level WAIT_TIME is fixed for the
        # whole run, which defeats the point of randomizing the delay.
        time.sleep(random.uniform(1, 3))
        return True
    except (NoSuchElementException, ElementNotInteractableException, TimeoutException) as e:
        print(f"Error loading more results: {e}")
        return False
38
+
39
def _field_text(parent, xpath, default="NA"):
    # Return .text of the first descendant matching xpath, or `default` when
    # the element is missing or has gone stale.
    try:
        return parent.find_element(By.XPATH, xpath).text
    except (NoSuchElementException, StaleElementReferenceException):
        return default


def scrape_movie_data(driver):
    """Collect title/year/rating/description for each result card on the page.

    Returns a list of dicts with keys 'title', 'type', 'year', 'rating' and
    'description'; fields that cannot be read are recorded as "NA".
    Raises ScrapingError when the result list itself cannot be located.
    """
    all_movies = []
    try:
        # Wait for the result-list container to exist before querying items.
        # NOTE(review): this absolute XPath is tied to IMDb's current DOM
        # layout — confirm it still matches after site redesigns.
        WebDriverWait(driver, SCRAPE_TIMEOUT).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul"))
        )
        # Then wait for the individual movie cards inside the list.
        movie_elements = WebDriverWait(driver, SCRAPE_TIMEOUT).until(
            EC.presence_of_all_elements_located((By.XPATH, "//li[@class='ipc-metadata-list-summary-item']"))
        )
        # The original used bare `except:` around each field lookup, which
        # swallows every exception (even KeyboardInterrupt); the helper above
        # narrows this to the two selenium lookup failures that can occur here.
        for movie_element in movie_elements:
            # Strip the leading "12. " ranking prefix IMDb puts on titles.
            title = re.sub(r'\d+\.\s*', '',
                           _field_text(movie_element, ".//h3[@class='ipc-title__text']"))
            year = _field_text(movie_element, ".//span[@class='sc-b189961a-8 kLaxqf dli-title-metadata-item']")
            # Rating text looks like "8.5 (1.2M)"; keep only the numeric part.
            rating_parts = _field_text(movie_element, ".//span[@class='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating']").split()
            rating = rating_parts[0] if rating_parts else "NA"
            description = _field_text(movie_element, ".//div[@class='ipc-html-content-inner-div']")

            all_movies.append({
                'title': title,
                'type': "Tv-Series",
                'year': year,
                'rating': rating,
                'description': description
            })
    except (NoSuchElementException, ElementNotInteractableException, TimeoutException, StaleElementReferenceException) as e:
        print(f"Error scraping movie data: {e}")
        # Chain the cause so the original selenium error stays in the traceback.
        raise ScrapingError("Error during scraping.") from e
    return all_movies
82
+
83
def main():
    """Launch headless Chrome, page through IMDb search results up to
    MAX_PAGES, and save the scraped rows to 'imdb_movies.csv'."""
    # Raw string: backslashes in a Windows path must not be interpreted as
    # escape sequences (the original relied on '\c' happening to be invalid).
    driver_path = r"C:\chromedriver-win64\chromedriver-win64\chromedriver.exe"
    user_data_dir = os.path.join(os.path.expanduser('~'), 'AppData/Local/Google/Chrome/User Data/Default')

    options = Options()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')
    options.add_argument(f'user-data-dir={user_data_dir}')

    service = Service(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    all_movies_data = []
    try:
        driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')

        for cnt in range(1, MAX_PAGES + 1):
            print(f"Scraping page {cnt}")
            try:
                all_movies_data.extend(scrape_movie_data(driver))
            except ScrapingError:
                # Best-effort: a bad page is skipped, not fatal.
                print("Encountered a scraping error. Skipping page.")
                continue
            if not load_more_results(driver):
                break
    finally:
        # Always release the browser process, even when scraping raises —
        # the original leaked the driver on any uncaught exception.
        driver.quit()

    df = pd.DataFrame(all_movies_data)
    df.to_csv('imdb_movies.csv', index=False)
    print("Data scraped and saved to 'imdb_movies.csv'")


if __name__ == "__main__":
    main()
0 commit comments