import os
import random
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
16
# Tunables for the scraping run.
MAX_PAGES = 300          # hard cap on "load more" iterations in main()
LOAD_MORE_TIMEOUT = 10   # seconds to wait for the see-more/next-page button
SCRAPE_TIMEOUT = 10      # seconds to wait for the results list to appear
# NOTE(review): this value is sampled ONCE at import time, so every sleep on
# WAIT_TIME waits the exact same duration for the whole run — probably not
# the intended "random delay per request". Confirm and move the uniform()
# call to the sleep site if randomness per call is wanted.
WAIT_TIME = random.uniform(1, 3)
21
+
22
class ScrapingError(Exception):
    """Raised when a page of IMDb results cannot be scraped.

    Wraps the underlying WebDriver exception so callers can handle one
    scraper-specific error type.
    """
25
+
18
26
def load_more_results(driver):
    """Click IMDb's "see more" / next-page button to load another result page.

    Args:
        driver: a selenium WebDriver positioned on an IMDb search-results page.

    Returns:
        True when the button was found and clicked, False when no clickable
        button appeared within LOAD_MORE_TIMEOUT seconds (end of results).
    """
    try:
        # BUGFIX: the previous XPath had mismatched brackets
        # ("...'__button')] or contains(...)]") and was invalid XPath 1.0;
        # both class alternatives must sit inside a single [...] predicate.
        load_more_button = WebDriverWait(driver, LOAD_MORE_TIMEOUT).until(
            EC.element_to_be_clickable((
                By.XPATH,
                "//button[contains(@class, 'ipc-see-more__button') or contains(@class, 'next-page')]",
            ))
        )
        # Scroll into view and click through JS to avoid clicks being
        # intercepted by overlays/sticky headers.
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
        driver.execute_script("arguments[0].click();", load_more_button)
        # Draw a fresh random delay on every call; the module-level WAIT_TIME
        # is sampled only once at import and would make the pacing constant.
        time.sleep(random.uniform(1, 3))
        return True
    except (NoSuchElementException, ElementNotInteractableException, TimeoutException) as e:
        print(f"Error loading more results: {e}")
        return False
30
38
31
39
def scrape_movie_data(driver):
    """Scrape title/year/rating/description for every title currently listed.

    Args:
        driver: a selenium WebDriver positioned on an IMDb search-results page.

    Returns:
        A list of dicts with keys 'title', 'type', 'year', 'rating',
        'description'. Fields missing from a list item are recorded as "NA".

    Raises:
        ScrapingError: when the results list cannot be located or read
            (wraps the underlying WebDriver exception as its __cause__).
    """

    def _field(element, xpath):
        # Best-effort text extraction for one optional sub-node.
        # BUGFIX: was a bare `except:` per field, which also swallowed
        # KeyboardInterrupt/SystemExit; catch only the lookup failures.
        try:
            return element.find_element(By.XPATH, xpath).text
        except (NoSuchElementException, StaleElementReferenceException):
            return "NA"

    all_movies = []
    try:
        # Wait for the results container before touching its items.
        # NOTE(review): absolute /html/body/... XPath is brittle and breaks
        # on any IMDb layout change — consider a class-based locator.
        WebDriverWait(driver, SCRAPE_TIMEOUT).until(
            EC.presence_of_element_located((
                By.XPATH,
                "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul",
            ))
        )
        movie_elements = WebDriverWait(driver, SCRAPE_TIMEOUT).until(
            EC.presence_of_all_elements_located((
                By.XPATH, "//li[@class='ipc-metadata-list-summary-item']"
            ))
        )

        for movie_element in movie_elements:
            org_title = _field(movie_element, ".//h3[@class='ipc-title__text']")
            # Strip the "12. " ranking prefix IMDb prepends to each title.
            title = re.sub(r'\d+\.\s*', '', org_title) if org_title != "NA" else "NA"

            year = _field(
                movie_element,
                ".//span[@class='sc-b189961a-8 kLaxqf dli-title-metadata-item']",
            )

            rating_text = _field(
                movie_element,
                ".//span[@class='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating']",
            )
            # Rating text looks like "8.5 (1.2M)"; keep only the numeric part.
            rating_parts = rating_text.split()
            rating = rating_parts[0] if rating_parts else "NA"

            description = _field(
                movie_element, ".//div[@class='ipc-html-content-inner-div']"
            )

            all_movies.append({
                'title': title,
                'type': "Tv-Series",
                'year': year,
                'rating': rating,
                'description': description,
            })
    except (NoSuchElementException, ElementNotInteractableException,
            TimeoutException, StaleElementReferenceException) as e:
        print(f"Error scraping movie data: {e}")
        # BUGFIX: chain the cause so the WebDriver traceback is preserved.
        raise ScrapingError("Error during scraping.") from e
    return all_movies
64
82
65
83
def main ():
@@ -74,15 +92,25 @@ def main():
74
92
service = Service (executable_path = driver_path )
75
93
driver = webdriver .Chrome (service = service , options = options )
76
94
driver .get ('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31' )
95
+
77
96
all_movies_data = []
78
- while True :
79
- movies_data = scrape_movie_data (driver )
80
- all_movies_data .extend (movies_data )
97
+ cnt = 0
98
+ while cnt < MAX_PAGES :
99
+ cnt += 1
100
+ print (f"Scraping page { cnt } " )
101
+ try :
102
+ movies_data = scrape_movie_data (driver )
103
+ all_movies_data .extend (movies_data )
104
+ except ScrapingError :
105
+ print ("Encountered a scraping error. Skipping page." )
106
+ continue
81
107
if not load_more_results (driver ):
82
108
break
109
+
83
110
df = pd .DataFrame (all_movies_data )
84
111
df .to_csv ('imdb_movies.csv' , index = False )
85
112
driver .quit ()
86
113
print ("Data scraped and saved to 'imdb_movies.csv'" )
114
+
87
115
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
0 commit comments