import os
import random
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

18
+ def load_more_results (driver ):
19
+ try :
20
+ load_more_button = WebDriverWait (driver , 10 ).until (
21
+ EC .element_to_be_clickable ((By .XPATH , "//button[contains(@class, 'ipc-see-more__button')] or contains(@class, 'next-page')]" ))
22
+ )
23
+ driver .execute_script ("arguments[0].scrollIntoView(true);" , load_more_button )
24
+ driver .execute_script ("arguments[0].click();" , load_more_button )
25
+ time .sleep (random .uniform (1 , 3 ))
26
+ return True
27
+ except (NoSuchElementException , ElementNotInteractableException , TimeoutException ) as e :
28
+ print (f"Error loading more results: { e } " )
29
+ return False
30
+
31
+ def scrape_movie_data (driver ):
32
+ all_movies = []
33
+ movie_elements = driver .find_element (By .XPATH , "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul" )
34
+ html_content = movie_elements .get_attribute ('outerHTML' )
35
+ soup = BeautifulSoup (html_content , 'html.parser' )
36
+ lst = soup .find_all ("li" , class_ = "ipc-metadata-list-summary-item" )
37
+ for i in lst :
38
+ try :
39
+ org_title = i .find ("h3" , class_ = "ipc-title__text" ).text
40
+ title = re .sub (r'\d+\.\s*' , '' , org_title )
41
+ except :
42
+ title = "NA"
43
+ try :
44
+ year = i .find ("span" , class_ = "sc-b189961a-8 kLaxqf dli-title-metadata-item" ).text
45
+ except :
46
+ year = "NA"
47
+ try :
48
+ rating = i .find ("span" , class_ = 'ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating' ).text .split ()[0 ]
49
+ except :
50
+ rating = "NA"
51
+ try :
52
+ description = i .find ("div" , class_ = 'ipc-html-content-inner-div' ).text
53
+ except :
54
+ description = "NA"
55
+
56
+ all_movies .append ({
57
+ 'title' : title ,
58
+ 'type' : "Tv-Series" ,
59
+ 'year' : year ,
60
+ 'rating' : rating ,
61
+ 'description' : description
62
+ })
63
+ return all_movies
64
+
65
+ def main ():
66
+ # Replace with the actual path to your chromedriver executable
67
+ driver_path = "C:\chromedriver-win64\chromedriver-win64\chromedriver.exe"
68
+ user_data_dir = os .path .join (os .path .expanduser ('~' ), 'AppData/Local/Google/Chrome/User Data/Default' )
69
+ options = Options ()
70
+ options .add_argument ('--no-sandbox' )
71
+ options .add_argument ('--disable-dev-shm-usage' )
72
+ options .add_argument ('--headless' )
73
+ options .add_argument (f'user-data-dir={ user_data_dir } ' )
74
+ service = Service (executable_path = driver_path )
75
+ driver = webdriver .Chrome (service = service , options = options )
76
+ driver .get ('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31' )
77
+ all_movies_data = []
78
+ while True :
79
+ movies_data = scrape_movie_data (driver )
80
+ all_movies_data .extend (movies_data )
81
+ if not load_more_results (driver ):
82
+ break
83
+ df = pd .DataFrame (all_movies_data )
84
+ df .to_csv ('imdb_movies.csv' , index = False )
85
+ driver .quit ()
86
+ print ("Data scraped and saved to 'imdb_movies.csv'" )
87
+ if __name__ == "__main__" :
88
+ main ()