import csv
import os
import re
import time

import streamlit as st
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
+
11
+ # Function to scrape IMDb data
12
+ def scrape_imdb_data ():
13
+ options = webdriver .ChromeOptions ()
14
+ options .add_argument ('--no-sandbox' )
15
+ options .add_argument ('--disable-dev-shm-usage' )
16
+ options .add_argument ('--headless' ) # Run Chrome in headless mode
17
+
18
+ service = Service (ChromeDriverManager ().install ())
19
+ driver = webdriver .Chrome (options = options , service = service )
20
+
21
+ driver .get ('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31' )
22
+ driver .set_script_timeout (10000 )
11
23
12
- DRIVER_PATH = 'E:/chromedriver-win64/chromedriver'
13
- # Initialize the Chrome driver
14
-
15
-
16
- options = webdriver .ChromeOptions ()
17
- options .add_argument ('--no-sandbox' )
18
- options .add_argument ('--disable-dev-shm-usage' )
19
- driver = webdriver .Chrome (options = options ,executable_path = DRIVER_PATH )
20
-
21
- # Navigate to the URL
22
- driver .get ('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31' )
23
-
24
- driver .set_script_timeout (10000 )
25
- def load_more_results ():
26
- try :
27
- load_more_button = WebDriverWait (driver , 10 ).until (
28
- EC .element_to_be_clickable ((By .XPATH , '//button[contains(@class, "ipc-see-more__button")]' ))
29
- )
30
- driver .execute_script ("arguments[0].scrollIntoView(true);" , load_more_button )
31
- driver .execute_script ("arguments[0].click();" , load_more_button )
32
- time .sleep (2 )
33
- return True
34
- except Exception as e :
35
- print (f"Error: { e } " )
36
- return False
37
- def save_to_csv (movies , filename = 'movies.csv' ):
38
- keys = movies [0 ].keys ()
39
- with open (filename , 'a' , newline = '' , encoding = 'utf-8' ) as output_file :
40
- dict_writer = csv .DictWriter (output_file , fieldnames = keys )
41
- dict_writer .writeheader ()
42
- dict_writer .writerows (movies )
43
-
44
-
45
- all_movies = []
46
- cnt = 0
47
- while (cnt < 300 ):
48
- cnt += 1
49
- print (cnt )
50
- if not load_more_results ():
24
+ def load_more_results ():
25
+ try :
26
+ load_more_button = WebDriverWait (driver , 10 ).until (
27
+ EC .element_to_be_clickable ((By .XPATH , '//button[contains(@class, "ipc-see-more__button")]' ))
28
+ )
29
+ driver .execute_script ("arguments[0].scrollIntoView(true);" , load_more_button )
30
+ driver .execute_script ("arguments[0].click();" , load_more_button )
31
+ time .sleep (2 )
32
+ return True
33
+ except Exception as e :
34
+ print (f"Error: { e } " )
35
+ return False
36
+
37
+ def save_to_csv (movies , filename = 'movies.csv' ):
38
+ file_exists = os .path .isfile (filename )
39
+ keys = movies [0 ].keys ()
40
+ with open (filename , 'a' , newline = '' , encoding = 'utf-8' ) as output_file :
41
+ dict_writer = csv .DictWriter (output_file , fieldnames = keys )
42
+ if not file_exists :
43
+ dict_writer .writeheader ()
44
+ dict_writer .writerows (movies )
45
+
46
+ all_movies = []
47
+ cnt = 0
48
+ while cnt < 300 :
49
+ cnt += 1
50
+ if not load_more_results ():
51
51
break
52
-
53
- movie_elements = driver .find_element (By .XPATH , "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul" )
54
- print ("movie_list" )
55
-
56
- html_content = movie_elements .get_attribute ('outerHTML' )
57
- print ("html movie_list" )
58
- soup = BeautifulSoup (html_content , 'html.parser' )
59
-
60
- lst = soup .find_all ("li" , class_ = "ipc-metadata-list-summary-item" )
61
- print ("list" )
62
- for i in lst :
63
- org_title = i .find ("h3" ,class_ = "ipc-title__text" ).text
64
- try :
65
- title = re .sub (r'\d+\.\s*' , '' , org_title )
66
- except :
67
- title = "NA"
68
- try :
69
- year = i .find ("span" , class_ = "sc-b189961a-8 kLaxqf dli-title-metadata-item" ).text
52
+
53
+ movie_elements = driver .find_elements (By .XPATH , "//div[contains(@class, 'lister-item mode-advanced')]" )
70
54
71
- except :
72
- year = "NA"
73
- try :
74
- rating = i .find ("span" , class_ = 'ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating' ).text .split ()[0 ]
75
- except :
76
- rating = "NA"
77
- try :
78
- description = i .find ("div" , class_ = 'ipc-html-content-inner-div' ).text
79
- except :
80
- description = "NA"
81
- all_movies .append ({
82
- 'title' : title ,
83
- 'type' :"Tv-Series" ,
84
- 'year' : year ,
85
- 'rating' : rating ,
86
- 'description' : description
87
- })
88
-
89
- print ("saving started" )
90
- if all_movies :
91
- save_to_csv (all_movies )
92
- print ("completed" )
93
- driver .quit ()
55
+ for element in movie_elements :
56
+ soup = BeautifulSoup (element .get_attribute ('outerHTML' ), 'html.parser' )
57
+
58
+ try :
59
+ org_title = soup .find ("h3" , class_ = "lister-item-header" ).find ("a" ).text
60
+ title = re .sub (r'\d+\.\s*' , '' , org_title )
61
+ except :
62
+ title = "NA"
63
+
64
+ try :
65
+ year = soup .find ("span" , class_ = "lister-item-year" ).text
66
+ except :
67
+ year = "NA"
68
+
69
+ try :
70
+ rating = soup .find ("div" , class_ = "ratings-bar" ).find ("strong" ).text
71
+ except :
72
+ rating = "NA"
73
+
74
+ try :
75
+ description = soup .find_all ("p" , class_ = "text-muted" )[1 ].text .strip ()
76
+ except :
77
+ description = "NA"
78
+
79
+ all_movies .append ({
80
+ 'title' : title ,
81
+ 'type' : "Tv-Series" ,
82
+ 'year' : year ,
83
+ 'rating' : rating ,
84
+ 'description' : description
85
+ })
86
+
87
+ if all_movies :
88
+ save_to_csv (all_movies )
89
+ all_movies = []
90
+
91
+ driver .quit ()
92
+
93
+ # Streamlit App
94
+ def main ():
95
+ st .title ("IMDb Scraper" )
96
+
97
+ if st .button ("Scrape IMDb Data" ):
98
+ with st .spinner ("Scraping IMDb data..." ):
99
+ scrape_imdb_data ()
100
+ st .success ("Data scraped successfully!" )
101
+
102
+ # Show the CSV file content
103
+ st .subheader ("Scraped IMDb Data:" )
104
+ filename = 'movies.csv'
105
+ if os .path .exists (filename ):
106
+ with open (filename , 'r' , encoding = 'utf-8' ) as file :
107
+ csv_content = file .read ()
108
+ st .code (csv_content , language = 'csv' )
109
+ else :
110
+ st .error ("CSV file not found." )
111
+
112
+ if __name__ == "__main__" :
113
+ main ()